diff --git a/.codespell/requirements.txt b/.codespell/requirements.txt index 407f17489c6..ddff454685c 100644 --- a/.codespell/requirements.txt +++ b/.codespell/requirements.txt @@ -1 +1 @@ -codespell==2.2.4 +codespell==2.2.5 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 43547a431af..3778d032623 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: sudo apt-get install tcl8.6 tclx ./runtest --verbose --tags -slow --dump-logs - name: module api test - run: ./runtest-moduleapi --verbose --dump-logs + run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs - name: validate commands.def up to date run: | touch src/commands/ping.json @@ -31,13 +31,13 @@ jobs: - uses: actions/checkout@v3 - name: make # build with TLS module just for compilation coverage - run: make SANITIZER=address REDIS_CFLAGS='-Werror' BUILD_TLS=module + run: make SANITIZER=address REDIS_CFLAGS='-Werror -DDEBUG_ASSERTIONS' BUILD_TLS=module - name: testprep run: sudo apt-get install tcl8.6 tclx -y - name: test run: ./runtest --verbose --tags -slow --dump-logs - name: module api test - run: ./runtest-moduleapi --verbose --dump-logs + run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs build-debian-old: runs-on: ubuntu-latest diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index dc7413e59c4..fc92dec2182 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -22,12 +22,12 @@ jobs: uses: actions/checkout@v3 - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} - name: Autobuild - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml new file 
mode 100644 index 00000000000..0237c8739fb --- /dev/null +++ b/.github/workflows/coverity.yml @@ -0,0 +1,32 @@ +# Creates and uploads a Coverity build on a schedule +name: Coverity Scan +on: + schedule: + # Run once daily, since below 500k LOC can have 21 builds per week, per https://scan.coverity.com/faq#frequency + - cron: '0 0 * * *' + # Support manual execution + workflow_dispatch: +jobs: + coverity: + if: github.repository == 'redis/redis' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@main + - name: Download and extract the Coverity Build Tool + run: | + wget -q https://scan.coverity.com/download/cxx/linux64 --post-data "token=${{ secrets.COVERITY_SCAN_TOKEN }}&project=redis-unstable" -O cov-analysis-linux64.tar.gz + mkdir cov-analysis-linux64 + tar xzf cov-analysis-linux64.tar.gz --strip 1 -C cov-analysis-linux64 + - name: Install Redis dependencies + run: sudo apt install -y gcc tcl8.6 tclx procps libssl-dev + - name: Build with cov-build + run: cov-analysis-linux64/bin/cov-build --dir cov-int make + - name: Upload the result + run: | + tar czvf cov-int.tgz cov-int + curl \ + --form project=redis-unstable \ + --form email=${{ secrets.COVERITY_SCAN_EMAIL }} \ + --form token=${{ secrets.COVERITY_SCAN_TOKEN }} \ + --form file=@cov-int.tgz \ + https://scan.coverity.com/builds diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index 72720e6dd43..8e382ec80a3 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -60,7 +60,7 @@ jobs: run: ./runtest --accurate --verbose --dump-logs ${{github.event.inputs.test_args}} - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') - run: ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} + run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel 
${{github.event.inputs.cluster_test_args}} @@ -104,7 +104,7 @@ jobs: run: ./runtest --accurate --verbose --dump-logs ${{github.event.inputs.test_args}} - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') - run: ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} + run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} @@ -144,7 +144,7 @@ jobs: run: ./runtest --accurate --verbose --dump-logs ${{github.event.inputs.test_args}} - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') - run: ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} + run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} @@ -181,7 +181,7 @@ jobs: run: ./runtest --accurate --verbose --dump-logs ${{github.event.inputs.test_args}} - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') - run: ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} + run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} @@ -222,7 +222,7 @@ jobs: if: true && !contains(github.event.inputs.skiptests, 'modules') run: | make -C tests/modules 32bit # the script below doesn't have an argument, we must build manually ahead of time - ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} + CFLAGS='-Werror' ./runtest-moduleapi --verbose 
--dump-logs ${{github.event.inputs.test_args}} - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} @@ -267,7 +267,7 @@ jobs: - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') run: | - ./runtest-moduleapi --verbose --dump-logs --tls --dump-logs ${{github.event.inputs.test_args}} + CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs --tls --dump-logs ${{github.event.inputs.test_args}} - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: | @@ -311,7 +311,7 @@ jobs: - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') run: | - ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} + CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: | @@ -489,7 +489,7 @@ jobs: sudo apt-get install tcl8.6 tclx valgrind -y - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') - run: ./runtest-moduleapi --valgrind --no-latency --verbose --clients 1 --timeout 2400 --dump-logs ${{github.event.inputs.test_args}} + run: CFLAGS='-Werror' ./runtest-moduleapi --valgrind --no-latency --verbose --clients 1 --timeout 2400 --dump-logs ${{github.event.inputs.test_args}} - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: | @@ -554,7 +554,7 @@ jobs: sudo apt-get install tcl8.6 tclx valgrind -y - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') - run: ./runtest-moduleapi --valgrind --no-latency --verbose --clients 1 --timeout 2400 --dump-logs ${{github.event.inputs.test_args}} + run: CFLAGS='-Werror' ./runtest-moduleapi --valgrind --no-latency --verbose --clients 1 --timeout 2400 --dump-logs 
${{github.event.inputs.test_args}} - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: | @@ -587,7 +587,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make SANITIZER=address REDIS_CFLAGS='-DREDIS_TEST -Werror' + run: make SANITIZER=address REDIS_CFLAGS='-DREDIS_TEST -Werror -DDEBUG_ASSERTIONS' - name: testprep run: | sudo apt-get update @@ -597,7 +597,7 @@ jobs: run: ./runtest --accurate --verbose --dump-logs ${{github.event.inputs.test_args}} - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') - run: ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} + run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} @@ -644,7 +644,7 @@ jobs: run: ./runtest --accurate --verbose --dump-logs ${{github.event.inputs.test_args}} - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') - run: ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} + run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} @@ -687,7 +687,7 @@ jobs: run: ./runtest --accurate --verbose --dump-logs ${{github.event.inputs.test_args}} - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') - run: ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} + run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel 
${{github.event.inputs.cluster_test_args}} @@ -732,7 +732,7 @@ jobs: - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') run: | - ./runtest-moduleapi --verbose --dump-logs --tls-module --dump-logs ${{github.event.inputs.test_args}} + CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs --tls-module --dump-logs ${{github.event.inputs.test_args}} - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: | @@ -779,7 +779,7 @@ jobs: - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') run: | - ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} + CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: | @@ -813,10 +813,10 @@ jobs: run: make REDIS_CFLAGS='-Werror' - name: test if: true && !contains(github.event.inputs.skiptests, 'redis') - run: ./runtest --accurate --verbose --verbose --clients 1 --no-latency --dump-logs ${{github.event.inputs.test_args}} + run: ./runtest --accurate --verbose --clients 1 --no-latency --dump-logs ${{github.event.inputs.test_args}} - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') - run: ./runtest-moduleapi --verbose --verbose --clients 1 --no-latency --dump-logs ${{github.event.inputs.test_args}} + run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --clients 1 --no-latency --dump-logs ${{github.event.inputs.test_args}} test-macos-latest-sentinel: runs-on: macos-latest @@ -870,45 +870,19 @@ jobs: if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} - test-freebsd: - runs-on: macos-12 + build-macos: + strategy: + matrix: + os: [macos-11, macos-13] + runs-on: ${{ matrix.os }} if: | (github.event_name == 'workflow_dispatch' || (github.event_name != 
'workflow_dispatch' && github.repository == 'redis/redis')) && - !contains(github.event.inputs.skipjobs, 'freebsd') && !(contains(github.event.inputs.skiptests, 'redis') && contains(github.event.inputs.skiptests, 'modules')) + !contains(github.event.inputs.skipjobs, 'macos') timeout-minutes: 14400 steps: - - name: prep - if: github.event_name == 'workflow_dispatch' - run: | - echo "GITHUB_REPOSITORY=${{github.event.inputs.use_repo}}" >> $GITHUB_ENV - echo "GITHUB_HEAD_REF=${{github.event.inputs.use_git_ref}}" >> $GITHUB_ENV - echo "skipjobs: ${{github.event.inputs.skipjobs}}" - echo "skiptests: ${{github.event.inputs.skiptests}}" - echo "test_args: ${{github.event.inputs.test_args}}" - echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}" - - uses: actions/checkout@v3 - with: - repository: ${{ env.GITHUB_REPOSITORY }} - ref: ${{ env.GITHUB_HEAD_REF }} - - name: test - uses: vmactions/freebsd-vm@v0.3.1 + - uses: maxim-lobanov/setup-xcode@v1 with: - usesh: true - sync: rsync - copyback: false - prepare: pkg install -y bash gmake lang/tcl86 lang/tclx - run: > - gmake || exit 1 ; - if echo "${{github.event.inputs.skiptests}}" | grep -vq redis ; then ./runtest --verbose --timeout 2400 --no-latency --dump-logs ${{github.event.inputs.test_args}} || exit 1 ; fi ; - if echo "${{github.event.inputs.skiptests}}" | grep -vq modules ; then MAKE=gmake ./runtest-moduleapi --verbose --timeout 2400 --no-latency --dump-logs ${{github.event.inputs.test_args}} || exit 1 ; fi ; - - test-freebsd-sentinel: - runs-on: macos-12 - if: | - (github.event_name == 'workflow_dispatch' || (github.event_name != 'workflow_dispatch' && github.repository == 'redis/redis')) && - !contains(github.event.inputs.skipjobs, 'freebsd') && !contains(github.event.inputs.skiptests, 'sentinel') - timeout-minutes: 14400 - steps: + xcode-version: latest - name: prep if: github.event_name == 'workflow_dispatch' run: | @@ -922,22 +896,14 @@ jobs: with: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ 
env.GITHUB_HEAD_REF }} - - name: test - uses: vmactions/freebsd-vm@v0.3.1 - with: - usesh: true - sync: rsync - copyback: false - prepare: pkg install -y bash gmake lang/tcl86 lang/tclx - run: > - gmake || exit 1 ; - if echo "${{github.event.inputs.skiptests}}" | grep -vq sentinel ; then ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} || exit 1 ; fi ; + - name: make + run: make REDIS_CFLAGS='-Werror -DREDIS_TEST' - test-freebsd-cluster: + test-freebsd: runs-on: macos-12 if: | (github.event_name == 'workflow_dispatch' || (github.event_name != 'workflow_dispatch' && github.repository == 'redis/redis')) && - !contains(github.event.inputs.skipjobs, 'freebsd') && !contains(github.event.inputs.skiptests, 'cluster') + !contains(github.event.inputs.skipjobs, 'freebsd') timeout-minutes: 14400 steps: - name: prep @@ -945,24 +911,21 @@ jobs: run: | echo "GITHUB_REPOSITORY=${{github.event.inputs.use_repo}}" >> $GITHUB_ENV echo "GITHUB_HEAD_REF=${{github.event.inputs.use_git_ref}}" >> $GITHUB_ENV - echo "skipjobs: ${{github.event.inputs.skipjobs}}" - echo "skiptests: ${{github.event.inputs.skiptests}}" - echo "test_args: ${{github.event.inputs.test_args}}" - echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}" - uses: actions/checkout@v3 with: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: test - uses: vmactions/freebsd-vm@v0.3.1 + uses: cross-platform-actions/action@v0.22.0 with: - usesh: true - sync: rsync - copyback: false - prepare: pkg install -y bash gmake lang/tcl86 lang/tclx - run: > - gmake || exit 1 ; - if echo "${{github.event.inputs.skiptests}}" | grep -vq cluster ; then ./runtest-cluster ${{github.event.inputs.cluster_test_args}} || exit 1 ; fi ; + operating_system: freebsd + environment_variables: MAKE + version: 13.2 + shell: bash + run: | + sudo pkg install -y bash gmake lang/tcl86 lang/tclx + gmake + ./runtest --single unit/keyspace --single unit/auth --single unit/networking --single unit/protocol 
test-alpine-jemalloc: runs-on: ubuntu-latest @@ -995,7 +958,7 @@ jobs: run: ./runtest --accurate --verbose --dump-logs ${{github.event.inputs.test_args}} - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') - run: ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} + run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} @@ -1034,7 +997,7 @@ jobs: run: ./runtest --accurate --verbose --dump-logs ${{github.event.inputs.test_args}} - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') - run: ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} + run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} @@ -1071,7 +1034,7 @@ jobs: run: ./runtest --log-req-res --no-latency --dont-clean --force-resp3 --tags -slow --verbose --dump-logs ${{github.event.inputs.test_args}} - name: module api test if: true && !contains(github.event.inputs.skiptests, 'modules') - run: ./runtest-moduleapi --log-req-res --no-latency --dont-clean --force-resp3 --dont-pre-clean --verbose --dump-logs ${{github.event.inputs.test_args}} + run: CFLAGS='-Werror' ./runtest-moduleapi --log-req-res --no-latency --dont-clean --force-resp3 --dont-pre-clean --verbose --dump-logs ${{github.event.inputs.test_args}} - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel --log-req-res --dont-clean --force-resp3 ${{github.event.inputs.cluster_test_args}} diff --git a/.github/workflows/external.yml b/.github/workflows/external.yml index 
15a9afb6e41..0c884053b76 100644 --- a/.github/workflows/external.yml +++ b/.github/workflows/external.yml @@ -23,6 +23,7 @@ jobs: run: | ./runtest \ --host 127.0.0.1 --port 6379 \ + --verbose \ --tags -slow - name: Archive redis log if: ${{ failure() }} @@ -49,6 +50,7 @@ jobs: run: | ./runtest \ --host 127.0.0.1 --port 6379 \ + --verbose \ --cluster-mode \ --tags -slow - name: Archive redis log @@ -73,6 +75,7 @@ jobs: run: | ./runtest \ --host 127.0.0.1 --port 6379 \ + --verbose \ --tags "-slow -needs:debug" - name: Archive redis log if: ${{ failure() }} diff --git a/.github/workflows/reply-schemas-linter.yml b/.github/workflows/reply-schemas-linter.yml index 13fc8ab88d0..6893bb3dca9 100644 --- a/.github/workflows/reply-schemas-linter.yml +++ b/.github/workflows/reply-schemas-linter.yml @@ -14,7 +14,7 @@ jobs: steps: - uses: actions/checkout@v3 - name: Setup nodejs - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 - name: Install packages run: npm install ajv - name: linter diff --git a/.github/workflows/spell-check.yml b/.github/workflows/spell-check.yml index 53360741291..77f5437ca2b 100644 --- a/.github/workflows/spell-check.yml +++ b/.github/workflows/spell-check.yml @@ -19,7 +19,7 @@ jobs: uses: actions/checkout@v3 - name: pip cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pip key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index d66769b984e..648a4926856 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -26,7 +26,7 @@ Examples of unacceptable behavior include: advances of any kind * Trolling, insulting or derogatory comments, and personal or political attacks * Public or private harassment -* Publishing others’ private information, such as a physical or email +* Publishing others' private information, such as a physical or email address, without their explicit permission * Other conduct which could reasonably be considered inappropriate in a 
professional setting @@ -89,7 +89,7 @@ Attribution This Code of Conduct is adapted from the Contributor Covenant, version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. -Community Impact Guidelines were inspired by Mozilla’s code of conduct +Community Impact Guidelines were inspired by Mozilla's code of conduct enforcement ladder. For answers to common questions about this code of conduct, see the FAQ at https://www.contributor-covenant.org/faq. Translations are available at diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 56b71834d6b..4ae73e3b338 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,20 +1,82 @@ -Note: by contributing code to the Redis project in any form, including sending -a pull request via Github, a code fragment or patch via private email or -public discussion groups, you agree to release your code under the terms -of the BSD license that you can find in the COPYING file included in the Redis -source distribution. You will include BSD license in the COPYING file within -each source file that you contribute. +By contributing code to the Redis project in any form you agree to the Redis Software Grant and +Contributor License Agreement attached below. Only contributions made under the Redis Software Grant +and Contributor License Agreement may be accepted by Redis, and any contribution is subject to the +terms of the Redis dual-license under RSALv2/SSPLv1 as described in the LICENSE.txt file included in +the Redis source distribution. + +# REDIS SOFTWARE GRANT AND CONTRIBUTOR LICENSE AGREEMENT + +To specify the intellectual property license granted in any Contribution, Redis Ltd., ("**Redis**") +requires a Software Grant and Contributor License Agreement ("**Agreement**"). This Agreement is for +your protection as a contributor as well as the protection of Redis and its users; it does not +change your rights to use your own Contribution for any other purpose. 
+ +By making any Contribution, You accept and agree to the following terms and conditions for the +Contribution. Except for the license granted in this Agreement to Redis and the recipients of the +software distributed by Redis, You reserve all right, title, and interest in and to Your +Contribution. + +1. **Definitions** + + 1.1. "**You**" (or "**Your**") means the copyright owner or legal entity authorized by the + copyright owner that is entering into this Agreement with Redis. For legal entities, the entity + making a Contribution and all other entities that Control, are Controlled by, or are under + common Control with that entity are considered to be a single contributor. For the purposes of + this definition, "**Control**" means (i) the power, direct or indirect, to cause the direction + or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty + percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + + 1.2. "**Contribution**" means the code, documentation, or any original work of authorship, + including any modifications or additions to an existing work described above. + +2. "**Work**" means any software project stewarded by Redis. + +3. **Grant of Copyright License**. Subject to the terms and conditions of this Agreement, You grant + to Redis and to the recipients of the software distributed by Redis a perpetual, worldwide, + non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare + derivative works of, publicly display, publicly perform, sublicense, and distribute Your + Contribution and such derivative works. + +4. **Grant of Patent License**. 
Subject to the terms and conditions of this Agreement, You grant to + Redis and to the recipients of the software distributed by Redis a perpetual, worldwide, + non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent + license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable by You that are necessarily + infringed by Your Contribution alone or by a combination of Your Contribution with the Work to + which such Contribution was submitted. If any entity institutes patent litigation against You or + any other entity (including a cross-claim or counterclaim in a lawsuit) alleging that your + Contribution, or the Work to which you have contributed, constitutes a direct or contributory + patent infringement, then any patent licenses granted to the claimant entity under this Agreement + for that Contribution or Work terminate as of the date such litigation is filed. + +5. **Representations and Warranties**. You represent and warrant that: (i) You are legally entitled + to grant the above licenses; and (ii) if You are an entity, each employee or agent designated by + You is authorized to submit the Contribution on behalf of You; and (iii) your Contribution is + Your original work, and that it will not infringe on any third party's intellectual property + right(s). + +6. **Disclaimer**. You are not expected to provide support for Your Contribution, except to the + extent You desire to provide support. You may provide support for free, for a fee, or not at all. + Unless required by applicable law or agreed to in writing, You provide Your Contribution on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, + including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, + MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. + +7. **Enforceability**. 
Nothing in this Agreement will be construed as creating any joint venture, + employment relationship, or partnership between You and Redis. If any provision of this Agreement + is held to be unenforceable, the remaining provisions of this Agreement will not be affected. + This represents the entire agreement between You and Redis relating to the Contribution. # IMPORTANT: HOW TO USE REDIS GITHUB ISSUES -Github issues SHOULD ONLY BE USED to report bugs, and for DETAILED feature -requests. Everything else belongs to the Redis Google Group: +GitHub issues SHOULD ONLY BE USED to report bugs and for DETAILED feature +requests. Everything else should be asked on Discord: - https://groups.google.com/forum/m/#!forum/Redis-db + https://discord.com/invite/redis PLEASE DO NOT POST GENERAL QUESTIONS that are not about bugs or suspected -bugs in the Github issues system. We'll be very happy to help you and provide -all the support in the mailing list. +bugs in the GitHub issues system. We'll be delighted to help you and provide +all the support on Discord. There is also an active community of Redis users at Stack Overflow: @@ -33,24 +95,24 @@ straight away: if your feature is not a conceptual fit you'll lose a lot of time writing the code without any reason. Start by posting in the mailing list and creating an issue at Github with the description of, exactly, what you want to accomplish and why. Use cases are important for features to be accepted. -Here you'll see if there is consensus about your idea. +Here you can see if there is consensus about your idea. 2. If in step 1 you get an acknowledgment from the project leaders, use the following procedure to submit a patch: - a. Fork Redis on github ( https://docs.github.com/en/github/getting-started-with-github/fork-a-repo ) + a. Fork Redis on GitHub ( https://docs.github.com/en/github/getting-started-with-github/fork-a-repo ) b. Create a topic branch (git checkout -b my_branch) c. 
Push to your branch (git push origin my_branch) - d. Initiate a pull request on github ( https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request ) + d. Initiate a pull request on GitHub ( https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request ) e. Done :) 3. Keep in mind that we are very overloaded, so issues and PRs sometimes wait -for a *very* long time. However this is not lack of interest, as the project +for a *very* long time. However this is not a lack of interest, as the project gets more and more users, we find ourselves in a constant need to prioritize certain issues/PRs over others. If you think your issue/PR is very important try to popularize it, have other users commenting and sharing their point of -view and so forth. This helps. +view, and so forth. This helps. -4. For minor fixes just open a pull request on Github. +4. For minor fixes - open a pull request on GitHub. -Thanks! +Additional information on the RSALv2/SSPLv1 dual-license is also found in the LICENSE.txt file. diff --git a/COPYING b/COPYING deleted file mode 100644 index a381681a1c2..00000000000 --- a/COPYING +++ /dev/null @@ -1,10 +0,0 @@ -Copyright (c) 2006-2020, Salvatore Sanfilippo -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Redis nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 00000000000..a60c2460490 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,733 @@ +Starting on March 20th, 2024, Redis follows a dual-licensing model with all Redis project code +contributions under version 7.4 and subsequent releases governed by the Redis Software Grant and +Contributor License Agreement. After this date, contributions are subject to the user's choice of +the Redis Source Available License v2 (RSALv2) or the Server Side Public License v1 (SSPLv1), as +follows: + + +1. Redis Source Available License 2.0 (RSALv2) Agreement +======================================================== + +Last Update: December 30, 2023 + +Acceptance +---------- + +This Agreement sets forth the terms and conditions on which the Licensor +makes available the Software. By installing, downloading, accessing, +Using, or distributing any of the Software, You agree to all of the +terms and conditions of this Agreement. + +If You are receiving the Software on behalf of Your Company, You +represent and warrant that You have the authority to agree to this +Agreement on behalf of such entity. + +The Licensor reserves the right to update this Agreement from time to +time. 
+ +The terms below have the meanings set forth below for purposes of this +Agreement: + +Definitions +----------- + +Agreement: this Redis Source Available License 2.0 Agreement. + +Control: ownership, directly or indirectly, of substantially all the +assets of an entity, or the power to direct its management and policies +by vote, contract, or otherwise. + +License: the License as described in the License paragraph below. + +Licensor: the entity offering these terms, which includes Redis Ltd. on +behalf of itself and its subsidiaries and affiliates worldwide. + +Modify, Modified, or Modification: copy from or adapt all or part of the +work in a fashion requiring copyright permission other than making an +exact copy. The resulting work is called a Modified version of the +earlier work. + +Redis: the Redis software as described in redis.com redis.io. + +Software: certain Software components designed to work with Redis and +provided to You under this Agreement. + +Trademark: the trademarks, service marks, and any other similar rights. + +Use: anything You do with the Software requiring one of Your Licenses. + +You: the recipient of the Software, the individual or entity on whose +behalf You are agreeing to this Agreement. + +Your Company: any legal entity, sole proprietorship, or other kind of +organization that You work for, plus all organizations that have control +over, are under the control of, or are under common control with that +organization. + +Your Licenses: means all the Licenses granted to You for the Software +under this Agreement. + +License +------- + +The Licensor grants You a non-exclusive, royalty-free, worldwide, +non-sublicensable, non-transferable license to use, copy, distribute, +make available, and prepare derivative works of the Software, in each +case subject to the limitations and conditions below. 
+ +Limitations +----------- + +You may not make the functionality of the Software or a Modified version +available to third parties as a service or distribute the Software or a +Modified version in a manner that makes the functionality of the +Software available to third parties. + +Making the functionality of the Software or Modified version available +to third parties includes, without limitation, enabling third parties to +interact with the functionality of the Software or Modified version in +distributed form or remotely through a computer network, offering a +product or service, the value of which entirely or primarily derives +from the value of the Software or Modified version, or offering a +product or service that accomplishes for users the primary purpose of +the Software or Modified version. + +You may not alter, remove, or obscure any licensing, copyright, or other +notices of the Licensor in the Software. Any use of the Licensor's +Trademarks is subject to applicable law. + +Patents +------- + +The Licensor grants You a License, under any patent claims the Licensor +can License, or becomes able to License, to make, have made, use, sell, +offer for sale, import and have imported the Software, in each case +subject to the limitations and conditions in this License. This License +does not cover any patent claims that You cause to be infringed by +Modifications or additions to the Software. If You or Your Company make +any written claim that the Software infringes or contributes to +infringement of any patent, your patent License for the Software granted +under this Agreement ends immediately. If Your Company makes such a +claim, your patent License ends immediately for work on behalf of Your +Company. + +Notices +------- + +You must ensure that anyone who gets a copy of any part of the Software +from You also gets a copy of the terms and conditions in this Agreement. 
+ +If You modify the Software, You must include in any Modified copies of +the Software prominent notices stating that You have Modified the +Software. + +No Other Rights +--------------- + +The terms and conditions of this Agreement do not imply any Licenses +other than those expressly granted in this Agreement. + +Termination +----------- + +If You Use the Software in violation of this Agreement, such Use is not +Licensed, and Your Licenses will automatically terminate. If the +Licensor provides You with a notice of your violation, and You cease all +violations of this License no later than 30 days after You receive that +notice, Your Licenses will be reinstated retroactively. However, if You +violate this Agreement after such reinstatement, any additional +violation of this Agreement will cause your Licenses to terminate +automatically and permanently. + +No Liability +------------ + +As far as the law allows, the Software comes as is, without any +warranty or condition, and the Licensor will not be liable to You for +any damages arising out of this Agreement or the Use or nature of the +Software, under any kind of legal claim. + +Governing Law and Jurisdiction +------------------------------ + +If You are located in Asia, Pacific, Americas, or other jurisdictions +not listed below, the Agreement will be construed and enforced in all +respects in accordance with the laws of the State of California, U.S.A., +without reference to its choice of law rules. The courts located in the +County of Santa Clara, California, have exclusive jurisdiction for all +purposes relating to this Agreement. + +If You are located in Israel, the Agreement will be construed and +enforced in all respects in accordance with the laws of the State of +Israel without reference to its choice of law rules. The courts located +in the Central District of the State of Israel have exclusive +jurisdiction for all purposes relating to this Agreement. 
+ +If You are located in Europe, United Kingdom, Middle East or Africa, the +Agreement will be construed and enforced in all respects in accordance +with the laws of England and Wales without reference to its choice of +law rules. The competent courts located in London, England, have +exclusive jurisdiction for all purposes relating to this Agreement. + + + +2. Server Side Public License (SSPL) +==================================== + + Server Side Public License + VERSION 1, OCTOBER 16, 2018 + + Copyright (c) 2018 MongoDB, Inc. + + Everyone is permitted to copy and distribute verbatim copies of this + license document, but changing it is not allowed. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to Server Side Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of + works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this + License. Each licensee is addressed as "you". "Licensees" and + "recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work in + a fashion requiring copyright permission, other than the making of an + exact copy. The resulting work is called a "modified version" of the + earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based on + the Program. + + To "propagate" a work means to do anything with it that, without + permission, would make you directly or secondarily liable for + infringement under applicable copyright law, except executing it on a + computer or modifying a private copy. Propagation includes copying, + distribution (with or without modification), making available to the + public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other + parties to make or receive copies. 
Mere interaction with a user through a + computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" to the + extent that it includes a convenient and prominently visible feature that + (1) displays an appropriate copyright notice, and (2) tells the user that + there is no warranty for the work (except to the extent that warranties + are provided), that licensees may convey the work under this License, and + how to view a copy of this License. If the interface presents a list of + user commands or options, such as a menu, a prominent item in the list + meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work for + making modifications to it. "Object code" means any non-source form of a + work. + + A "Standard Interface" means an interface that either is an official + standard defined by a recognized standards body, or, in the case of + interfaces specified for a particular programming language, one that is + widely used among developers working in that language. The "System + Libraries" of an executable work include anything, other than the work as + a whole, that (a) is included in the normal form of packaging a Major + Component, but which is not part of that Major Component, and (b) serves + only to enable use of the work with that Major Component, or to implement + a Standard Interface for which an implementation is available to the + public in source code form. A "Major Component", in this context, means a + major essential component (kernel, window system, and so on) of the + specific operating system (if any) on which the executable work runs, or + a compiler used to produce the work, or an object code interpreter used + to run it. 
+ + The "Corresponding Source" for a work in object code form means all the + source code needed to generate, install, and (for an executable work) run + the object code and to modify the work, including scripts to control + those activities. However, it does not include the work's System + Libraries, or general-purpose tools or generally available free programs + which are used unmodified in performing those activities but which are + not part of the work. For example, Corresponding Source includes + interface definition files associated with source files for the work, and + the source code for shared libraries and dynamically linked subprograms + that the work is specifically designed to require, such as by intimate + data communication or control flow between those subprograms and other + parts of the work. + + The Corresponding Source need not include anything that users can + regenerate automatically from other parts of the Corresponding Source. + + The Corresponding Source for a work in source code form is that same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of + copyright on the Program, and are irrevocable provided the stated + conditions are met. This License explicitly affirms your unlimited + permission to run the unmodified Program, subject to section 13. The + output from running a covered work is covered by this License only if the + output, given its content, constitutes a covered work. This License + acknowledges your rights of fair use or other equivalent, as provided by + copyright law. Subject to section 13, you may make, run and propagate + covered works that you do not convey, without conditions so long as your + license otherwise remains in force. 
You may convey covered works to + others for the sole purpose of having them make modifications exclusively + for you, or provide you with facilities for running those works, provided + that you comply with the terms of this License in conveying all + material for which you do not control copyright. Those thus making or + running the covered works for you must do so exclusively on your + behalf, under your direction and control, on terms that prohibit them + from making any copies of your copyrighted material outside their + relationship with you. + + Conveying under any other circumstances is permitted solely under the + conditions stated below. Sublicensing is not allowed; section 10 makes it + unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological + measure under any applicable law fulfilling obligations under article 11 + of the WIPO copyright treaty adopted on 20 December 1996, or similar laws + prohibiting or restricting circumvention of such measures. + + When you convey a covered work, you waive any legal power to forbid + circumvention of technological measures to the extent such circumvention is + effected by exercising rights under this License with respect to the + covered work, and you disclaim any intention to limit operation or + modification of the work as a means of enforcing, against the work's users, + your or third parties' legal rights to forbid circumvention of + technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you + receive it, in any medium, provided that you conspicuously and + appropriately publish on each copy an appropriate copyright notice; keep + intact all notices stating that this License and any non-permissive terms + added in accord with section 7 apply to the code; keep intact all notices + of the absence of any warranty; and give all recipients a copy of this + License along with the Program. You may charge any price or no price for + each copy that you convey, and you may offer support or warranty + protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to + produce it from the Program, in the form of source code under the terms + of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified it, + and giving a relevant date. + + b) The work must carry prominent notices stating that it is released + under this License and any conditions added under section 7. This + requirement modifies the requirement in section 4 to "keep intact all + notices". + + c) You must license the entire work, as a whole, under this License to + anyone who comes into possession of a copy. This License will therefore + apply, along with any applicable section 7 additional terms, to the + whole of the work, and all its parts, regardless of how they are + packaged. This License gives no permission to license the work in any + other way, but it does not invalidate such permission if you have + separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your work + need not make them do so. 
+ + A compilation of a covered work with other separate and independent + works, which are not by their nature extensions of the covered work, and + which are not combined with it such as to form a larger program, in or on + a volume of a storage or distribution medium, is called an "aggregate" if + the compilation and its resulting copyright are not used to limit the + access or legal rights of the compilation's users beyond what the + individual works permit. Inclusion of a covered work in an aggregate does + not cause this License to apply to the other parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms of + sections 4 and 5, provided that you also convey the machine-readable + Corresponding Source under the terms of this License, in one of these + ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium customarily + used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a written + offer, valid for at least three years and valid for as long as you + offer spare parts or customer support for that product model, to give + anyone who possesses the object code either (1) a copy of the + Corresponding Source for all the software in the product that is + covered by this License, on a durable physical medium customarily used + for software interchange, for a price no more than your reasonable cost + of physically performing this conveying of source, or (2) access to + copy the Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This alternative is + allowed only occasionally and noncommercially, and only if you received + the object code with such an offer, in accord with subsection 6b. + + d) Convey the object code by offering access from a designated place + (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to copy + the object code is a network server, the Corresponding Source may be on + a different server (operated by you or a third party) that supports + equivalent copying facilities, provided you maintain clear directions + next to the object code saying where to find the Corresponding Source. + Regardless of what server hosts the Corresponding Source, you remain + obligated to ensure that it is available for as long as needed to + satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided you + inform other peers where the object code and Corresponding Source of + the work are being offered to the general public at no charge under + subsection 6d. + + A separable portion of the object code, whose source code is excluded + from the Corresponding Source as a System Library, need not be included + in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any + tangible personal property which is normally used for personal, family, + or household purposes, or (2) anything designed or sold for incorporation + into a dwelling. In determining whether a product is a consumer product, + doubtful cases shall be resolved in favor of coverage. 
For a particular + product received by a particular user, "normally used" refers to a + typical or common use of that class of product, regardless of the status + of the particular user or of the way in which the particular user + actually uses, or expects or is expected to use, the product. A product + is a consumer product regardless of whether the product has substantial + commercial, industrial or non-consumer uses, unless such uses represent + the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, + procedures, authorization keys, or other information required to install + and execute modified versions of a covered work in that User Product from + a modified version of its Corresponding Source. The information must + suffice to ensure that the continued functioning of the modified object + code is in no case prevented or interfered with solely because + modification has been made. + + If you convey an object code work under this section in, or with, or + specifically for use in, a User Product, and the conveying occurs as part + of a transaction in which the right of possession and use of the User + Product is transferred to the recipient in perpetuity or for a fixed term + (regardless of how the transaction is characterized), the Corresponding + Source conveyed under this section must be accompanied by the + Installation Information. But this requirement does not apply if neither + you nor any third party retains the ability to install modified object + code on the User Product (for example, the work has been installed in + ROM). + + The requirement to provide Installation Information does not include a + requirement to continue to provide support service, warranty, or updates + for a work that has been modified or installed by the recipient, or for + the User Product in which it has been modified or installed. 
Access + to a network may be denied when the modification itself materially + and adversely affects the operation of the network or violates the + rules and protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, in + accord with this section must be in a format that is publicly documented + (and with an implementation available to the public in source code form), + and must require no special password or key for unpacking, reading or + copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this + License by making exceptions from one or more of its conditions. + Additional permissions that are applicable to the entire Program shall be + treated as though they were included in this License, to the extent that + they are valid under applicable law. If additional permissions apply only + to part of the Program, that part may be used separately under those + permissions, but the entire Program remains governed by this License + without regard to the additional permissions. When you convey a copy of + a covered work, you may at your option remove any additional permissions + from that copy, or from any part of it. (Additional permissions may be + written to require their own removal in certain cases when you modify the + work.) You may place additional permissions on material, added by you to + a covered work, for which you have or can give appropriate copyright + permission. 
+ + Notwithstanding any other provision of this License, for material you add + to a covered work, you may (if authorized by the copyright holders of + that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some trade + names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that material + by anyone who conveys the material (or modified versions of it) with + contractual assumptions of liability to the recipient, for any + liability that these contractual assumptions directly impose on those + licensors and authors. + + All other non-permissive additional terms are considered "further + restrictions" within the meaning of section 10. If the Program as you + received it, or any part of it, contains a notice stating that it is + governed by this License along with a term that is a further restriction, + you may remove that term. If a license document contains a further + restriction but permits relicensing or conveying under this License, you + may add to a covered work material governed by the terms of that license + document, provided that the further restriction does not survive such + relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you must + place, in the relevant source files, a statement of the additional terms + that apply to those files, or a notice indicating where to find the + applicable terms. Additional terms, permissive or non-permissive, may be + stated in the form of a separately written license, or stated as + exceptions; the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly + provided under this License. Any attempt otherwise to propagate or modify + it is void, and will automatically terminate your rights under this + License (including any patent licenses granted under the third paragraph + of section 11). + + However, if you cease all violation of this License, then your license + from a particular copyright holder is reinstated (a) provisionally, + unless and until the copyright holder explicitly and finally terminates + your license, and (b) permanently, if the copyright holder fails to + notify you of the violation by some reasonable means prior to 60 days + after the cessation. + + Moreover, your license from a particular copyright holder is reinstated + permanently if the copyright holder notifies you of the violation by some + reasonable means, this is the first time you have received notice of + violation of this License (for any work) from that copyright holder, and + you cure the violation prior to 30 days after your receipt of the notice. + + Termination of your rights under this section does not terminate the + licenses of parties who have received copies or rights from you under + this License. If your rights have been terminated and not permanently + reinstated, you do not qualify to receive new licenses for the same + material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or run a + copy of the Program. 
Ancillary propagation of a covered work occurring + solely as a consequence of using peer-to-peer transmission to receive a + copy likewise does not require acceptance. However, nothing other than + this License grants you permission to propagate or modify any covered + work. These actions infringe copyright if you do not accept this License. + Therefore, by modifying or propagating a covered work, you indicate your + acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically receives + a license from the original licensors, to run, modify and propagate that + work, subject to this License. You are not responsible for enforcing + compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an + organization, or substantially all assets of one, or subdividing an + organization, or merging organizations. If propagation of a covered work + results from an entity transaction, each party to that transaction who + receives a copy of the work also receives whatever licenses to the work + the party's predecessor in interest had or could give under the previous + paragraph, plus a right to possession of the Corresponding Source of the + work from the predecessor in interest, if the predecessor has it or can + get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the rights + granted or affirmed under this License. For example, you may not impose a + license fee, royalty, or other charge for exercise of rights granted + under this License, and you may not initiate litigation (including a + cross-claim or counterclaim in a lawsuit) alleging that any patent claim + is infringed by making, using, selling, offering for sale, or importing + the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this + License of the Program or a work on which the Program is based. The work + thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims owned or + controlled by the contributor, whether already acquired or hereafter + acquired, that would be infringed by some manner, permitted by this + License, of making, using, or selling its contributor version, but do not + include claims that would be infringed only as a consequence of further + modification of the contributor version. For purposes of this definition, + "control" includes the right to grant patent sublicenses in a manner + consistent with the requirements of this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free + patent license under the contributor's essential patent claims, to make, + use, sell, offer for sale, import and otherwise run, modify and propagate + the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express + agreement or commitment, however denominated, not to enforce a patent + (such as an express permission to practice a patent or covenant not to + sue for patent infringement). To "grant" such a patent license to a party + means to make such an agreement or commitment not to enforce a patent + against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, and + the Corresponding Source of the work is not available for anyone to copy, + free of charge and under the terms of this License, through a publicly + available network server or other readily accessible means, then you must + either (1) cause the Corresponding Source to be so available, or (2) + arrange to deprive yourself of the benefit of the patent license for this + particular work, or (3) arrange, in a manner consistent with the + requirements of this License, to extend the patent license to downstream + recipients. "Knowingly relying" means you have actual knowledge that, but + for the patent license, your conveying the covered work in a country, or + your recipient's use of the covered work in a country, would infringe + one or more identifiable patents in that country that you have reason + to believe are valid. + + If, pursuant to or in connection with a single transaction or + arrangement, you convey, or propagate by procuring conveyance of, a + covered work, and grant a patent license to some of the parties receiving + the covered work authorizing them to use, propagate, modify or convey a + specific copy of the covered work, then the patent license you grant is + automatically extended to all recipients of the covered work and works + based on it. + + A patent license is "discriminatory" if it does not include within the + scope of its coverage, prohibits the exercise of, or is conditioned on + the non-exercise of one or more of the rights that are specifically + granted under this License. 
You may not convey a covered work if you are + a party to an arrangement with a third party that is in the business of + distributing software, under which you make payment to the third party + based on the extent of your activity of conveying the work, and under + which the third party grants, to any of the parties who would receive the + covered work from you, a discriminatory patent license (a) in connection + with copies of the covered work conveyed by you (or copies made from + those copies), or (b) primarily for and in connection with specific + products or compilations that contain the covered work, unless you + entered into that arrangement, or that patent license was granted, prior + to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting any + implied license or other defenses to infringement that may otherwise be + available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or + otherwise) that contradict the conditions of this License, they do not + excuse you from the conditions of this License. If you cannot use, + propagate or convey a covered work so as to satisfy simultaneously your + obligations under this License and any other pertinent obligations, then + as a consequence you may not use, propagate or convey it at all. For + example, if you agree to terms that obligate you to collect a royalty for + further conveying from those to whom you convey the Program, the only way + you could satisfy both those terms and this License would be to refrain + entirely from conveying the Program. + + 13. Offering the Program as a Service. + + If you make the functionality of the Program or a modified version + available to third parties as a service, you must make the Service Source + Code available via network download to everyone at no charge, under the + terms of this License. 
Making the functionality of the Program or + modified version available to third parties as a service includes, + without limitation, enabling third parties to interact with the + functionality of the Program or modified version remotely through a + computer network, offering a service the value of which entirely or + primarily derives from the value of the Program or modified version, or + offering a service that accomplishes for users the primary purpose of the + Program or modified version. + + "Service Source Code" means the Corresponding Source for the Program or + the modified version, and the Corresponding Source for all programs that + you use to make the Program or modified version available as a service, + including, without limitation, management software, user interfaces, + application program interfaces, automation software, monitoring software, + backup software, storage software and hosting software, all such that a + user could run an instance of the service using the Service Source Code + you make available. + + 14. Revised Versions of this License. + + MongoDB, Inc. may publish revised and/or new versions of the Server Side + Public License from time to time. Such new versions will be similar in + spirit to the present version, but may differ in detail to address new + problems or concerns. + + Each version is given a distinguishing version number. If the Program + specifies that a certain numbered version of the Server Side Public + License "or any later version" applies to it, you have the option of + following the terms and conditions either of that numbered version or of + any later version published by MongoDB, Inc. If the Program does not + specify a version number of the Server Side Public License, you may + choose any version ever published by MongoDB, Inc. 
+ + If the Program specifies that a proxy can decide which future versions of + the Server Side Public License can be used, that proxy's public statement + of acceptance of a version permanently authorizes you to choose that + version for the Program. + + Later license versions may give you additional or different permissions. + However, no additional obligations are imposed on any author or copyright + holder as a result of your choosing to follow a later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY + APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT + HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY + OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, + THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM + IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF + ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING + WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS + THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING + ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF + THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO + LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU + OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER + PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE + POSSIBILITY OF SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. 
+ + If the disclaimer of warranty and limitation of liability provided above + cannot be given local legal effect according to their terms, reviewing + courts shall apply local law that most closely approximates an absolute + waiver of all civil liability in connection with the Program, unless a + warranty or assumption of liability accompanies a copy of the Program in + return for a fee. + + END OF TERMS AND CONDITIONS diff --git a/README.md b/README.md index 4a1ce791492..ff483f9e1d7 100644 --- a/README.md +++ b/README.md @@ -217,18 +217,28 @@ You'll be able to stop and start Redis using the script named Code contributions ----------------- -Note: By contributing code to the Redis project in any form, including sending -a pull request via Github, a code fragment or patch via private email or -public discussion groups, you agree to release your code under the terms -of the BSD license that you can find in the [COPYING][1] file included in the Redis -source distribution. +By contributing code to the Redis project in any form, including sending a pull request via GitHub, +a code fragment or patch via private email or public discussion groups, you agree to release your +code under the terms of the [Redis Software Grant and Contributor License Agreement][1]. Redis software +contains contributions to the original Redis core project, which are owned by their contributors and +licensed under the 3BSD license. Any copy of that license in this repository applies only to those +contributions. Redis releases all Redis project versions from 7.4.x and thereafter under the +RSALv2/SSPL dual-license as described in the [LICENSE.txt][2] file included in the Redis source distribution. + +Please see the [CONTRIBUTING.md][1] file in this source distribution for more information. For +security bugs and vulnerabilities, please see [SECURITY.md][3]. 
+ +[1]: https://github.com/redis/redis/blob/unstable/CONTRIBUTING.md +[2]: https://github.com/redis/redis/blob/unstable/LICENSE.txt +[3]: https://github.com/redis/redis/blob/unstable/SECURITY.md -Please see the [CONTRIBUTING.md][2] file in this source distribution for more -information. For security bugs and vulnerabilities, please see [SECURITY.md][3]. +Redis Trademarks +---------------- -[1]: https://github.com/redis/redis/blob/unstable/COPYING -[2]: https://github.com/redis/redis/blob/unstable/CONTRIBUTING.md -[3]: https://github.com/redis/redis/blob/unstable/SECURITY.md +The purpose of a trademark is to identify the goods and services of a person or company without +causing confusion. As the registered owner of its name and logo, Redis accepts certain limited uses +of its trademarks but it has requirements that must be followed as described in its Trademark +Guidelines available at: https://redis.com/legal/trademark-guidelines/. Redis internals === @@ -420,7 +430,7 @@ implementations are the following: * `lookupKeyRead()` and `lookupKeyWrite()` are used in order to get a pointer to the value associated to a given key, or `NULL` if the key does not exist. * `dbAdd()` and its higher level counterpart `setKey()` create a new key in a Redis database. * `dbDelete()` removes a key and its associated value. -* `emptyDb()` removes an entire single database or all the databases defined. +* `emptyData()` removes an entire single database or all the databases defined. The rest of the file implements the generic commands exposed to the client. @@ -458,9 +468,9 @@ Script The script unit is composed of 3 units: * `script.c` - integration of scripts with Redis (commands execution, set replication/resp, ...) -* `script_lua.c` - responsible to execute Lua code, uses script.c to interact with Redis from within the Lua code. -* `function_lua.c` - contains the Lua engine implementation, uses script_lua.c to execute the Lua code. 
-* `functions.c` - contains Redis Functions implementation (FUNCTION command), uses functions_lua.c if the function it wants to invoke needs the Lua engine. +* `script_lua.c` - responsible to execute Lua code, uses `script.c` to interact with Redis from within the Lua code. +* `function_lua.c` - contains the Lua engine implementation, uses `script_lua.c` to execute the Lua code. +* `functions.c` - contains Redis Functions implementation (`FUNCTION` command), uses `functions_lua.c` if the function it wants to invoke needs the Lua engine. * `eval.c` - contains the `eval` implementation using `script_lua.c` to invoke the Lua code. diff --git a/REDISCONTRIBUTIONS.txt b/REDISCONTRIBUTIONS.txt new file mode 100644 index 00000000000..9a98f950e80 --- /dev/null +++ b/REDISCONTRIBUTIONS.txt @@ -0,0 +1,30 @@ +Copyright (c) 2006-Present, Redis Ltd. and Contributors +All rights reserved. + +Note: Continued Applicability of the BSD-3-Clause License + +Despite the shift to the dual-licensing model with Redis version 7.4 (RSALv2 or SSPLv1), portions of +Redis remain available subject to the BSD-3-Clause License (BSD). See below for the full BSD +license: + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions +and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions +and the following disclaimer in the documentation and/or other materials provided with the +distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse +or promote products derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/SECURITY.md b/SECURITY.md index ea66aaf65e9..5c348319dbe 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -11,17 +11,17 @@ unless this is not possible or feasible with a reasonable effort. | Version | Supported | | ------- | ------------------ | +| 7.2.x | :white_check_mark: | | 7.0.x | :white_check_mark: | | 6.2.x | :white_check_mark: | -| 6.0.x | :white_check_mark: | -| < 6.0 | :x: | +| < 6.2 | :x: | ## Reporting a Vulnerability -If you believe you’ve discovered a serious vulnerability, please contact the +If you believe you've discovered a serious vulnerability, please contact the Redis core team at redis@redis.io. We will evaluate your report and if necessary issue a fix and an advisory. If the issue was previously undisclosed, -we’ll also mention your name in the credits. +we'll also mention your name in the credits. ## Responsible Disclosure @@ -36,7 +36,7 @@ embargo on public disclosure. 
Vendors on the list are individuals or organizations that maintain Redis distributions or provide Redis as a service, who have third party users who -will benefit from the vendor’s ability to prepare for a new version or deploy a +will benefit from the vendor's ability to prepare for a new version or deploy a fix early. If you believe you should be on the list, please contact us and we will diff --git a/deps/Makefile b/deps/Makefile index c03c79790cb..3bf0363d5c2 100644 --- a/deps/Makefile +++ b/deps/Makefile @@ -3,6 +3,7 @@ uname_S:= $(shell sh -c 'uname -s 2>/dev/null || echo not') LUA_DEBUG?=no +LUA_COVERAGE?=no CCCOLOR="\033[34m" LINKCOLOR="\033[34;1m" @@ -85,6 +86,11 @@ ifeq ($(LUA_DEBUG),yes) else LUA_CFLAGS+= -O2 endif +ifeq ($(LUA_COVERAGE),yes) + LUA_CFLAGS += -fprofile-arcs -ftest-coverage + LUA_LDFLAGS += -fprofile-arcs -ftest-coverage +endif + # lua's Makefile defines AR="ar rcu", which is unusual, and makes it more # challenging to cross-compile lua (and redis). These defines make it easier # to fit redis into cross-compilation environments, which typically set AR. diff --git a/deps/README.md b/deps/README.md index 6d434c21f3c..8da051baa79 100644 --- a/deps/README.md +++ b/deps/README.md @@ -63,6 +63,10 @@ Hiredis Hiredis is used by Sentinel, `redis-cli` and `redis-benchmark`. Like Redis, uses the SDS string library, but not necessarily the same version. In order to avoid conflicts, this version has all SDS identifiers prefixed by `hi`. +1. `git subtree pull --prefix deps/hiredis https://github.com/redis/hiredis.git --squash`
+This should hopefully merge the local changes into the new version. +2. Conflicts will arise (due to our changes); you'll need to resolve them and commit. + Linenoise --- diff --git a/deps/hiredis/.github/workflows/build.yml b/deps/hiredis/.github/workflows/build.yml index 1a1ef515318..581800b4f70 100644 --- a/deps/hiredis/.github/workflows/build.yml +++ b/deps/hiredis/.github/workflows/build.yml @@ -133,8 +133,8 @@ jobs: - name: Install dependencies run: | - brew install openssl redis@6.2 - brew link redis@6.2 --force + brew install openssl redis@7.0 + brew link redis@7.0 --force - name: Build hiredis run: USE_SSL=1 make diff --git a/deps/hiredis/.github/workflows/test.yml b/deps/hiredis/.github/workflows/test.yml index 7812af6f74c..1a2c60b795a 100644 --- a/deps/hiredis/.github/workflows/test.yml +++ b/deps/hiredis/.github/workflows/test.yml @@ -60,7 +60,7 @@ jobs: steps: - name: Install qemu if: matrix.emulator - run: sudo apt-get install -y qemu-user + run: sudo apt-get update && sudo apt-get install -y qemu-user - name: Install platform toolset if: matrix.toolset run: sudo apt-get install -y gcc-${{matrix.toolset}} diff --git a/deps/hiredis/CHANGELOG.md b/deps/hiredis/CHANGELOG.md index a2e065b2cd3..801c4072937 100644 --- a/deps/hiredis/CHANGELOG.md +++ b/deps/hiredis/CHANGELOG.md @@ -1,12 +1,63 @@ +## [1.2.0](https://github.com/redis/hiredis/tree/v1.2.0) - (2023-06-04) + +Announcing Hiredis v1.2.0 with new adapters, and a great many bug fixes. + +## 🚀 New Features + +- Add sdevent adapter @Oipo (#1144) +- Allow specifying the keepalive interval @michael-grunder (#1168) +- Add RedisModule adapter @tezc (#1182) +- Helper for setting TCP_USER_TIMEOUT socket option @zuiderkwast (#1188) + +## 🐛 Bug Fixes + +- Fix a typo in b6a052f. @yossigo (#1190) +- Fix wincrypt symbols conflict @hudayou (#1151) +- Don't attempt to set a timeout if we are in an error state. @michael-grunder (#1180) +- Accept -nan per the RESP3 spec recommendation.
@michael-grunder (#1178) +- Fix colliding option values @zuiderkwast (#1172) +- Ensure functionality without `_MSC_VER` definition @windyakin (#1194) + +## 🧰 Maintenance + +- Add a test for the TCP_USER_TIMEOUT option. @michael-grunder (#1192) +- Add -Werror as a default. @yossigo (#1193) +- CI: Update homebrew Redis version. @yossigo (#1191) +- Fix typo in makefile. @michael-grunder (#1179) +- Write a version file for the CMake package @Neverlord (#1165) +- CMakeLists.txt: respect BUILD_SHARED_LIBS @ffontaine (#1147) +- Cmake static or shared @autoantwort (#1160) +- fix typo @tillkruss (#1153) +- Add a test ensuring we don't clobber connection error. @michael-grunder (#1181) +- Search for openssl on macOS @michael-grunder (#1169) + + +## Contributors +We'd like to thank all the contributors who worked on this release! + + + + + + + + + + + + + + + ## [1.1.0](https://github.com/redis/hiredis/tree/v1.1.0) - (2022-11-15) Announcing Hiredis v1.1.0 GA with better SSL convenience, new async adapters and a great many bug fixes. -**NOTE**: Hiredis can now return `nan` in addition to `-inf` and `inf` when returning a `REDIS_REPLY_DOUBLE`. +**NOTE**: Hiredis can now return `nan` in addition to `-inf` and `inf` when returning a `REDIS_REPLY_DOUBLE`. ## 🐛 Bug Fixes -- Add support for nan in RESP3 double [@filipecosta90](https://github.com/filipecosta90) +- Add support for nan in RESP3 double [@filipecosta90](https://github.com/filipecosta90) ([\#1133](https://github.com/redis/hiredis/pull/1133)) ## 🧰 Maintenance @@ -14,7 +65,7 @@ Announcing Hiredis v1.1.0 GA with better SSL convenience, new async adapters and - Add an example that calls redisCommandArgv [@michael-grunder](https://github.com/michael-grunder) ([\#1140](https://github.com/redis/hiredis/pull/1140)) - fix flag reference [@pata00](https://github.com/pata00) ([\#1136](https://github.com/redis/hiredis/pull/1136)) -- Make freeing a NULL redisAsyncContext a no op. 
[@michael-grunder](https://github.com/michael-grunder) +- Make freeing a NULL redisAsyncContext a no op. [@michael-grunder](https://github.com/michael-grunder) ([\#1135](https://github.com/redis/hiredis/pull/1135)) - CI updates ([@bjosv](https://github.com/redis/bjosv) ([\#1139](https://github.com/redis/hiredis/pull/1139)) diff --git a/deps/hiredis/Makefile b/deps/hiredis/Makefile index f31293e90c2..bd2106b1d12 100644 --- a/deps/hiredis/Makefile +++ b/deps/hiredis/Makefile @@ -39,7 +39,7 @@ export REDIS_TEST_CONFIG CC:=$(shell sh -c 'type $${CC%% *} >/dev/null 2>/dev/null && echo $(CC) || echo gcc') CXX:=$(shell sh -c 'type $${CXX%% *} >/dev/null 2>/dev/null && echo $(CXX) || echo g++') OPTIMIZATION?=-O3 -WARNINGS=-Wall -W -Wstrict-prototypes -Wwrite-strings -Wno-missing-field-initializers +WARNINGS=-Wall -Wextra -Werror -Wstrict-prototypes -Wwrite-strings -Wno-missing-field-initializers DEBUG_FLAGS?= -g -ggdb REAL_CFLAGS=$(OPTIMIZATION) -fPIC $(CPPFLAGS) $(CFLAGS) $(WARNINGS) $(DEBUG_FLAGS) $(PLATFORM_FLAGS) REAL_LDFLAGS=$(LDFLAGS) @@ -311,7 +311,7 @@ install: $(DYLIBNAME) $(STLIBNAME) $(PKGCONFNAME) $(SSL_INSTALL) $(INSTALL) hiredis.h async.h read.h sds.h alloc.h sockcompat.h $(INSTALL_INCLUDE_PATH) $(INSTALL) adapters/*.h $(INSTALL_INCLUDE_PATH)/adapters $(INSTALL) $(DYLIBNAME) $(INSTALL_LIBRARY_PATH)/$(DYLIB_MINOR_NAME) - cd $(INSTALL_LIBRARY_PATH) && ln -sf $(DYLIB_MINOR_NAME) $(DYLIBNAME) + cd $(INSTALL_LIBRARY_PATH) && ln -sf $(DYLIB_MINOR_NAME) $(DYLIBNAME) && ln -sf $(DYLIB_MINOR_NAME) $(DYLIB_MAJOR_NAME) $(INSTALL) $(STLIBNAME) $(INSTALL_LIBRARY_PATH) mkdir -p $(INSTALL_PKGCONF_PATH) $(INSTALL) $(PKGCONFNAME) $(INSTALL_PKGCONF_PATH) @@ -320,7 +320,7 @@ install-ssl: $(SSL_DYLIBNAME) $(SSL_STLIBNAME) $(SSL_PKGCONFNAME) mkdir -p $(INSTALL_INCLUDE_PATH) $(INSTALL_LIBRARY_PATH) $(INSTALL) hiredis_ssl.h $(INSTALL_INCLUDE_PATH) $(INSTALL) $(SSL_DYLIBNAME) $(INSTALL_LIBRARY_PATH)/$(SSL_DYLIB_MINOR_NAME) - cd $(INSTALL_LIBRARY_PATH) && ln -sf 
$(SSL_DYLIB_MINOR_NAME) $(SSL_DYLIBNAME) + cd $(INSTALL_LIBRARY_PATH) && ln -sf $(SSL_DYLIB_MINOR_NAME) $(SSL_DYLIBNAME) && ln -sf $(SSL_DYLIB_MINOR_NAME) $(SSL_DYLIB_MAJOR_NAME) $(INSTALL) $(SSL_STLIBNAME) $(INSTALL_LIBRARY_PATH) mkdir -p $(INSTALL_PKGCONF_PATH) $(INSTALL) $(SSL_PKGCONFNAME) $(INSTALL_PKGCONF_PATH) diff --git a/deps/hiredis/hiredis.c b/deps/hiredis/hiredis.c index fd200173f2d..8012035a05c 100644 --- a/deps/hiredis/hiredis.c +++ b/deps/hiredis/hiredis.c @@ -392,12 +392,12 @@ int redisvFormatCommand(char **target, const char *format, va_list ap) { while (*_p != '\0' && strchr(flags,*_p) != NULL) _p++; /* Field width */ - while (*_p != '\0' && isdigit(*_p)) _p++; + while (*_p != '\0' && isdigit((int) *_p)) _p++; /* Precision */ if (*_p == '.') { _p++; - while (*_p != '\0' && isdigit(*_p)) _p++; + while (*_p != '\0' && isdigit((int) *_p)) _p++; } /* Copy va_list before consuming with va_arg */ diff --git a/deps/hiredis/hiredis.h b/deps/hiredis/hiredis.h index 044a344e00e..635988b7e1c 100644 --- a/deps/hiredis/hiredis.h +++ b/deps/hiredis/hiredis.h @@ -46,9 +46,9 @@ typedef long long ssize_t; #include "alloc.h" /* for allocation wrappers */ #define HIREDIS_MAJOR 1 -#define HIREDIS_MINOR 1 -#define HIREDIS_PATCH 1 -#define HIREDIS_SONAME 1.1.1-dev +#define HIREDIS_MINOR 2 +#define HIREDIS_PATCH 0 +#define HIREDIS_SONAME 1.1.0 /* Connection type can be blocking or non-blocking and is set in the * least significant bit of the flags field in redisContext. 
*/ diff --git a/deps/hiredis/net.c b/deps/hiredis/net.c index d75a966580d..33fe0b94f4a 100644 --- a/deps/hiredis/net.c +++ b/deps/hiredis/net.c @@ -234,6 +234,7 @@ int redisContextSetTcpUserTimeout(redisContext *c, unsigned int timeout) { res = setsockopt(c->fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &timeout, sizeof(timeout)); #else res = -1; + errno = ENOTSUP; (void)timeout; #endif if (res == -1) { diff --git a/deps/hiredis/sds.c b/deps/hiredis/sds.c index f99962eb668..ac2b483525d 100644 --- a/deps/hiredis/sds.c +++ b/deps/hiredis/sds.c @@ -948,7 +948,7 @@ hisds *hi_sdssplitargs(const char *line, int *argc) { *argc = 0; while(1) { /* skip blanks */ - while(*p && isspace(*p)) p++; + while(*p && isspace((int) *p)) p++; if (*p) { /* get a token */ int inq=0; /* set to 1 if we are in "quotes" */ @@ -959,8 +959,8 @@ hisds *hi_sdssplitargs(const char *line, int *argc) { while(!done) { if (inq) { if (*p == '\\' && *(p+1) == 'x' && - isxdigit(*(p+2)) && - isxdigit(*(p+3))) + isxdigit((int) *(p+2)) && + isxdigit((int) *(p+3))) { unsigned char byte; @@ -984,7 +984,7 @@ hisds *hi_sdssplitargs(const char *line, int *argc) { } else if (*p == '"') { /* closing quote must be followed by a space or * nothing at all. */ - if (*(p+1) && !isspace(*(p+1))) goto err; + if (*(p+1) && !isspace((int) *(p+1))) goto err; done=1; } else if (!*p) { /* unterminated quotes */ @@ -999,7 +999,7 @@ hisds *hi_sdssplitargs(const char *line, int *argc) { } else if (*p == '\'') { /* closing quote must be followed by a space or * nothing at all. 
*/ - if (*(p+1) && !isspace(*(p+1))) goto err; + if (*(p+1) && !isspace((int) *(p+1))) goto err; done=1; } else if (!*p) { /* unterminated quotes */ diff --git a/deps/hiredis/ssl.c b/deps/hiredis/ssl.c index 1431803ad96..9ab18cc0e52 100644 --- a/deps/hiredis/ssl.c +++ b/deps/hiredis/ssl.c @@ -59,6 +59,8 @@ #include "async_private.h" #include "hiredis_ssl.h" +#define OPENSSL_1_1_0 0x10100000L + void __redisSetError(redisContext *c, int type, const char *str); struct redisSSLContext { @@ -100,7 +102,7 @@ redisContextFuncs redisContextSSLFuncs; * Note that this is only required for OpenSSL < 1.1.0. */ -#if OPENSSL_VERSION_NUMBER < 0x10100000L +#if OPENSSL_VERSION_NUMBER < OPENSSL_1_1_0 #define HIREDIS_USE_CRYPTO_LOCKS #endif @@ -256,13 +258,25 @@ redisSSLContext *redisCreateSSLContextWithOptions(redisSSLOptions *options, redi if (ctx == NULL) goto error; - ctx->ssl_ctx = SSL_CTX_new(SSLv23_client_method()); + const SSL_METHOD *ssl_method; +#if OPENSSL_VERSION_NUMBER >= OPENSSL_1_1_0 + ssl_method = TLS_client_method(); +#else + ssl_method = SSLv23_client_method(); +#endif + + ctx->ssl_ctx = SSL_CTX_new(ssl_method); if (!ctx->ssl_ctx) { if (error) *error = REDIS_SSL_CTX_CREATE_FAILED; goto error; } - SSL_CTX_set_options(ctx->ssl_ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3); +#if OPENSSL_VERSION_NUMBER >= OPENSSL_1_1_0 + SSL_CTX_set_min_proto_version(ctx->ssl_ctx, TLS1_2_VERSION); +#else + SSL_CTX_set_options(ctx->ssl_ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 | SSL_OP_NO_TLSv1 | SSL_OP_NO_TLSv1_1); +#endif + SSL_CTX_set_verify(ctx->ssl_ctx, options->verify_mode, NULL); if ((cert_filename != NULL && private_key_filename == NULL) || diff --git a/deps/hiredis/test.c b/deps/hiredis/test.c index ecfe075b7d1..f47e9ef2a2a 100644 --- a/deps/hiredis/test.c +++ b/deps/hiredis/test.c @@ -78,7 +78,7 @@ static int tests = 0, fails = 0, skips = 0; static void millisleep(int ms) { -#if _MSC_VER +#ifdef _MSC_VER Sleep(ms); #else usleep(ms*1000); @@ -409,10 +409,19 @@ static void 
test_tcp_options(struct config cfg) { redisContext *c; c = do_connect(cfg); + test("We can enable TCP_KEEPALIVE: "); test_cond(redisEnableKeepAlive(c) == REDIS_OK); - disconnect(c, 0); +#ifdef TCP_USER_TIMEOUT + test("We can set TCP_USER_TIMEOUT: "); + test_cond(redisSetTcpUserTimeout(c, 100) == REDIS_OK); +#else + test("Setting TCP_USER_TIMEOUT errors when unsupported: "); + test_cond(redisSetTcpUserTimeout(c, 100) == REDIS_ERR && c->err == REDIS_ERR_IO); +#endif + + redisFree(c); } static void test_reply_reader(void) { @@ -1567,6 +1576,9 @@ static void test_throughput(struct config config) { // } #ifdef HIREDIS_TEST_ASYNC + +#pragma GCC diagnostic ignored "-Woverlength-strings" /* required on gcc 4.8.x due to assert statements */ + struct event_base *base; typedef struct TestState { diff --git a/deps/linenoise/README.markdown b/deps/linenoise/README.markdown index 1afea2ae65c..b3752da162b 100644 --- a/deps/linenoise/README.markdown +++ b/deps/linenoise/README.markdown @@ -108,7 +108,7 @@ to search and re-edit already inserted lines of text. The followings are the history API calls: - int linenoiseHistoryAdd(const char *line); + int linenoiseHistoryAdd(const char *line, int is_sensitive); int linenoiseHistorySetMaxLen(int len); int linenoiseHistorySave(const char *filename); int linenoiseHistoryLoad(const char *filename); diff --git a/deps/linenoise/linenoise.c b/deps/linenoise/linenoise.c index dd86abe86e2..30d19a9152b 100644 --- a/deps/linenoise/linenoise.c +++ b/deps/linenoise/linenoise.c @@ -117,6 +117,7 @@ #include #include #include +#include #include "linenoise.h" #define LINENOISE_DEFAULT_HISTORY_MAX_LEN 100 @@ -134,6 +135,18 @@ static int atexit_registered = 0; /* Register atexit just 1 time. */ static int history_max_len = LINENOISE_DEFAULT_HISTORY_MAX_LEN; static int history_len = 0; static char **history = NULL; +static int *history_sensitive = NULL; /* An array records whether each line in + * history is sensitive. 
*/ + +static int reverse_search_mode_enabled = 0; +static int reverse_search_direction = 0; /* 1 means forward, -1 means backward. */ +static int cycle_to_next_search = 0; /* indicates whether to continue the search with CTRL+S or CTRL+R. */ +static char search_result[LINENOISE_MAX_LINE]; +static char search_result_friendly[LINENOISE_MAX_LINE]; +static int search_result_history_index = 0; +static int search_result_start_offset = 0; +static int ignore_once_hint = 0; /* Flag to ignore hint once, preventing it from interfering + * with search results right after exiting search mode. */ /* The linenoiseState structure represents the state during line editing. * We pass this state to functions implementing specific editing @@ -143,6 +156,7 @@ struct linenoiseState { int ofd; /* Terminal stdout file descriptor. */ char *buf; /* Edited line buffer. */ size_t buflen; /* Edited line buffer size. */ + const char *origin_prompt; /* Original prompt, used to restore when exiting search mode. */ const char *prompt; /* Prompt to display. */ size_t plen; /* Prompt length. */ size_t pos; /* Current cursor position. */ @@ -153,6 +167,13 @@ struct linenoiseState { int history_index; /* The history index we are currently editing. */ }; +typedef struct { + int len; /* Length of the result string. */ + char *result; /* Search result string. */ + int search_term_index; /* Position of the search term in the history record. */ + int search_term_len; /* Length of the search term. 
*/ +} linenoiseHistorySearchResult; + enum KEY_ACTION{ KEY_NULL = 0, /* NULL */ CTRL_A = 1, /* Ctrl+a */ @@ -161,6 +182,7 @@ enum KEY_ACTION{ CTRL_D = 4, /* Ctrl-d */ CTRL_E = 5, /* Ctrl-e */ CTRL_F = 6, /* Ctrl-f */ + CTRL_G = 7, /* Ctrl-g */ CTRL_H = 8, /* Ctrl-h */ TAB = 9, /* Tab */ NL = 10, /* Enter typed before raw mode was enabled */ @@ -169,6 +191,8 @@ enum KEY_ACTION{ ENTER = 13, /* Enter */ CTRL_N = 14, /* Ctrl-n */ CTRL_P = 16, /* Ctrl-p */ + CTRL_R = 18, /* Ctrl-r */ + CTRL_S = 19, /* Ctrl-s */ CTRL_T = 20, /* Ctrl-t */ CTRL_U = 21, /* Ctrl+u */ CTRL_W = 23, /* Ctrl+w */ @@ -177,8 +201,14 @@ enum KEY_ACTION{ }; static void linenoiseAtExit(void); -int linenoiseHistoryAdd(const char *line); +int linenoiseHistoryAdd(const char *line, int is_sensitive); static void refreshLine(struct linenoiseState *l); +static void refreshSearchResult(struct linenoiseState *ls); + +static inline void resetSearchResult(void) { + memset(search_result, 0, sizeof(search_result)); + memset(search_result_friendly, 0, sizeof(search_result_friendly)); +} /* Debugging macro. */ #if 0 @@ -219,6 +249,41 @@ void linenoiseSetMultiLine(int ml) { mlmode = ml; } +#define REVERSE_SEARCH_PROMPT(direction) ((direction) == -1 ? "(reverse-i-search): " : "(i-search): ") + +/* Enables the reverse search mode and refreshes the prompt. */ +static void enableReverseSearchMode(struct linenoiseState *l) { + assert(reverse_search_mode_enabled != 1); + reverse_search_mode_enabled = 1; + l->origin_prompt = l->prompt; + l->prompt = REVERSE_SEARCH_PROMPT(reverse_search_direction); + refreshLine(l); +} + +/* This function disables the reverse search mode and returns the terminal to its original state. + * If the 'discard' parameter is true, it discards the user's input search keyword and search result. + * Otherwise, it copies the search result into 'buf', If there is no search result, it copies the + * input search keyword instead. 
*/ +static void disableReverseSearchMode(struct linenoiseState *l, char *buf, size_t buflen, int discard) { + if (discard) { + buf[0] = '\0'; + l->pos = l->len = 0; + } else { + ignore_once_hint = 1; + if (strlen(search_result)) { + strncpy(buf, search_result, buflen); + buf[buflen-1] = '\0'; + l->pos = l->len = strlen(buf); + } + } + + /* Reset the state to non-search state. */ + reverse_search_mode_enabled = 0; + l->prompt = l->origin_prompt; + resetSearchResult(); + refreshLine(l); +} + /* Return true if the terminal name is in the list of terminals we know are * not able to understand basic escape sequences. */ static int isUnsupportedTerm(void) { @@ -233,6 +298,10 @@ static int isUnsupportedTerm(void) { /* Raw mode: 1960 magic shit. */ static int enableRawMode(int fd) { + if (getenv("FAKETTY_WITH_PROMPT") != NULL) { + return 0; + } + struct termios raw; if (!isatty(STDIN_FILENO)) goto fatal; @@ -301,6 +370,9 @@ static int getCursorPosition(int ifd, int ofd) { /* Try to get the number of columns in the current terminal, or assume 80 * if it fails. */ static int getColumns(int ifd, int ofd) { + if (getenv("FAKETTY_WITH_PROMPT") != NULL) { + goto failed; + } struct winsize ws; if (ioctl(1, TIOCGWINSZ, &ws) == -1 || ws.ws_col == 0) { @@ -492,6 +564,13 @@ static void abFree(struct abuf *ab) { * to the right of the prompt. */ void refreshShowHints(struct abuf *ab, struct linenoiseState *l, int plen) { char seq[64]; + + /* Show hits when not in reverse search mode and not instructed to ignore once. 
*/ + if (reverse_search_mode_enabled || ignore_once_hint) { + ignore_once_hint = 0; + return; + } + if (hintsCallback && plen+l->len < l->cols) { int color = -1, bold = 0; char *hint = hintsCallback(l->buf,&color,&bold); @@ -604,7 +683,12 @@ static void refreshMultiLine(struct linenoiseState *l) { unsigned int i; for (i = 0; i < l->len; i++) abAppend(&ab,"*",1); } else { - abAppend(&ab,l->buf,l->len); + refreshSearchResult(l); + if (strlen(search_result) > 0) { + abAppend(&ab, search_result_friendly, strlen(search_result_friendly)); + } else { + abAppend(&ab,l->buf,l->len); + } } /* Show hits if any. */ @@ -637,6 +721,9 @@ static void refreshMultiLine(struct linenoiseState *l) { /* Set column. */ col = (plen+(int)l->pos) % (int)l->cols; + if (strlen(search_result) > 0) { + col += search_result_start_offset; + } lndebug("set col %d", 1+col); if (col) snprintf(seq,64,"\r\x1b[%dC", col); @@ -818,7 +905,7 @@ static int linenoiseEdit(int stdin_fd, int stdout_fd, char *buf, size_t buflen, /* The latest history entry is always our current buffer, that * initially is just an empty string. */ - linenoiseHistoryAdd(""); + linenoiseHistoryAdd("", 0); if (write(l.ofd,prompt,l.plen) == -1) return -1; while(1) { @@ -832,7 +919,7 @@ static int linenoiseEdit(int stdin_fd, int stdout_fd, char *buf, size_t buflen, /* Only autocomplete when the callback is set. It returns < 0 when * there was an error reading from fd. Otherwise it will return the * character that should be handled next. 
*/ - if (c == 9 && completionCallback != NULL) { + if (c == TAB && completionCallback != NULL && !reverse_search_mode_enabled) { c = completeLine(&l); /* Return on errors */ if (c < 0) return l.len; @@ -843,6 +930,9 @@ static int linenoiseEdit(int stdin_fd, int stdout_fd, char *buf, size_t buflen, switch(c) { case NL: /* enter, typed before raw mode was enabled */ break; + case TAB: + if (reverse_search_mode_enabled) disableReverseSearchMode(&l, buf, buflen, 0); + break; case ENTER: /* enter */ history_len--; free(history[history_len]); @@ -855,8 +945,14 @@ static int linenoiseEdit(int stdin_fd, int stdout_fd, char *buf, size_t buflen, refreshLine(&l); hintsCallback = hc; } + + if (reverse_search_mode_enabled) disableReverseSearchMode(&l, buf, buflen, 0); return (int)l.len; case CTRL_C: /* ctrl-c */ + if (reverse_search_mode_enabled) { + disableReverseSearchMode(&l, buf, buflen, 1); + break; + } errno = EAGAIN; return -1; case BACKSPACE: /* backspace */ @@ -891,6 +987,23 @@ static int linenoiseEdit(int stdin_fd, int stdout_fd, char *buf, size_t buflen, case CTRL_P: /* ctrl-p */ linenoiseEditHistoryNext(&l, LINENOISE_HISTORY_PREV); break; + case CTRL_R: + case CTRL_S: + reverse_search_direction = c == CTRL_R ? -1 : 1; + if (reverse_search_mode_enabled) { + /* cycle search results */ + cycle_to_next_search = 1; + l.prompt = REVERSE_SEARCH_PROMPT(reverse_search_direction); + refreshLine(&l); + break; + } + buf[0] = '\0'; + l.pos = l.len = 0; + enableReverseSearchMode(&l); + break; + case CTRL_G: + if (reverse_search_mode_enabled) disableReverseSearchMode(&l, buf, buflen, 1); + break; case CTRL_N: /* ctrl-n */ linenoiseEditHistoryNext(&l, LINENOISE_HISTORY_NEXT); break; @@ -901,6 +1014,11 @@ static int linenoiseEdit(int stdin_fd, int stdout_fd, char *buf, size_t buflen, if (read(l.ifd,seq,1) == -1) break; if (read(l.ifd,seq+1,1) == -1) break; + if (reverse_search_mode_enabled) { + disableReverseSearchMode(&l, buf, buflen, 1); + break; + } + /* ESC [ sequences. 
*/ if (seq[0] == '[') { if (seq[1] >= '0' && seq[1] <= '9') { @@ -1067,14 +1185,14 @@ static char *linenoiseNoTTY(void) { * editing function or uses dummy fgets() so that you will be able to type * something even in the most desperate of the conditions. */ char *linenoise(const char *prompt) { - char buf[LINENOISE_MAX_LINE]; + char buf[LINENOISE_MAX_LINE] = {0}; int count; - if (!isatty(STDIN_FILENO)) { + if (getenv("FAKETTY_WITH_PROMPT") == NULL && !isatty(STDIN_FILENO)) { /* Not a tty: read from file / pipe. In this mode we don't want any * limit to the line size, so we call a function to handle that. */ return linenoiseNoTTY(); - } else if (isUnsupportedTerm()) { + } else if (getenv("FAKETTY_WITH_PROMPT") == NULL && isUnsupportedTerm()) { size_t len; printf("%s",prompt); @@ -1112,6 +1230,7 @@ static void freeHistory(void) { for (j = 0; j < history_len; j++) free(history[j]); free(history); + free(history_sensitive); } } @@ -1128,7 +1247,7 @@ static void linenoiseAtExit(void) { * histories, but will work well for a few hundred of entries. * * Using a circular buffer is smarter, but a bit more complex to handle. */ -int linenoiseHistoryAdd(const char *line) { +int linenoiseHistoryAdd(const char *line, int is_sensitive) { char *linecopy; if (history_max_len == 0) return 0; @@ -1137,7 +1256,14 @@ int linenoiseHistoryAdd(const char *line) { if (history == NULL) { history = malloc(sizeof(char*)*history_max_len); if (history == NULL) return 0; + history_sensitive = malloc(sizeof(int)*history_max_len); + if (history_sensitive == NULL) { + free(history); + history = NULL; + return 0; + } memset(history,0,(sizeof(char*)*history_max_len)); + memset(history_sensitive,0,(sizeof(int)*history_max_len)); } /* Don't add duplicated lines. 
*/ @@ -1150,9 +1276,11 @@ int linenoiseHistoryAdd(const char *line) { if (history_len == history_max_len) { free(history[0]); memmove(history,history+1,sizeof(char*)*(history_max_len-1)); + memmove(history_sensitive,history_sensitive+1,sizeof(int)*(history_max_len-1)); history_len--; } history[history_len] = linecopy; + history_sensitive[history_len] = is_sensitive; history_len++; return 1; } @@ -1163,6 +1291,7 @@ int linenoiseHistoryAdd(const char *line) { * than the amount of items already inside the history. */ int linenoiseHistorySetMaxLen(int len) { char **new; + int *new_sensitive; if (len < 1) return 0; if (history) { @@ -1170,6 +1299,11 @@ int linenoiseHistorySetMaxLen(int len) { new = malloc(sizeof(char*)*len); if (new == NULL) return 0; + new_sensitive = malloc(sizeof(int)*len); + if (new_sensitive == NULL) { + free(new); + return 0; + } /* If we can't copy everything, free the elements we'll not use. */ if (len < tocopy) { @@ -1179,9 +1313,13 @@ int linenoiseHistorySetMaxLen(int len) { tocopy = len; } memset(new,0,sizeof(char*)*len); + memset(new_sensitive,0,sizeof(int)*len); memcpy(new,history+(history_len-tocopy), sizeof(char*)*tocopy); + memcpy(new_sensitive,history_sensitive+(history_len-tocopy), sizeof(int)*tocopy); free(history); + free(history_sensitive); history = new; + history_sensitive = new_sensitive; } history_max_len = len; if (history_len > history_max_len) @@ -1201,7 +1339,7 @@ int linenoiseHistorySave(const char *filename) { if (fp == NULL) return -1; fchmod(fileno(fp),S_IRUSR|S_IWUSR); for (j = 0; j < history_len; j++) - fprintf(fp,"%s\n",history[j]); + if (!history_sensitive[j]) fprintf(fp,"%s\n",history[j]); fclose(fp); return 0; } @@ -1223,8 +1361,97 @@ int linenoiseHistoryLoad(const char *filename) { p = strchr(buf,'\r'); if (!p) p = strchr(buf,'\n'); if (p) *p = '\0'; - linenoiseHistoryAdd(buf); + linenoiseHistoryAdd(buf, 0); } fclose(fp); return 0; } + +/* This function updates the search index based on the direction of the 
search. + * Returns 0 if the beginning or end of the history is reached, otherwise, returns 1. */ +static int setNextSearchIndex(int *i) { + if (reverse_search_direction == 1) { + if (*i == history_len-1) return 0; + *i = *i + 1; + } else { + if (*i <= 0) return 0; + *i = *i - 1; + } + return 1; +} + +linenoiseHistorySearchResult searchInHistory(char *search_term) { + linenoiseHistorySearchResult result = {0}; + + if (!history_len || !strlen(search_term)) return result; + + int i = cycle_to_next_search ? search_result_history_index : + (reverse_search_direction == -1 ? history_len-1 : 0); + + while (1) { + char *found = strstr(history[i], search_term); + + /* check if we found the same string at another index when cycling, this would be annoying to cycle through + * as it might appear that cycling isn't working */ + int strings_are_the_same = cycle_to_next_search && strcmp(history[i], history[search_result_history_index]) == 0; + + if (found && !strings_are_the_same) { + int haystack_index = found - history[i]; + result.result = history[i]; + result.len = strlen(history[i]); + result.search_term_index = haystack_index; + result.search_term_len = strlen(search_term); + search_result_history_index = i; + break; + } + + /* Exit if reached the end. */ + if (!setNextSearchIndex(&i)) break; + } + + return result; +} + +static void refreshSearchResult(struct linenoiseState *ls) { + if (!reverse_search_mode_enabled) { + return; + } + + linenoiseHistorySearchResult sr = searchInHistory(ls->buf); + int found = sr.result && sr.len; + + /* If the search term has not changed and we are cycling to the next search result + * (using CTRL+R or CTRL+S), there is no need to reset the old search result. 
*/ + if (!cycle_to_next_search || found) + resetSearchResult(); + cycle_to_next_search = 0; + + if (found) { + char *bold = "\x1B[1m"; + char *normal = "\x1B[0m"; + + int size_needed = sr.search_term_index + sr.search_term_len + sr.len - + (sr.search_term_index+sr.search_term_len) + sizeof(normal) + sizeof(bold) + sizeof(normal); + if (size_needed > sizeof(search_result_friendly) - 1) { + return; + } + + /* Allocate memory for the prefix, match, and suffix strings, one extra byte for `\0`. */ + char *prefix = calloc(sizeof(char), sr.search_term_index + 1); + char *match = calloc(sizeof(char), sr.search_term_len + 1); + char *suffix = calloc(sizeof(char), sr.len - (sr.search_term_index+sr.search_term_len) + 1); + + memcpy(prefix, sr.result, sr.search_term_index); + memcpy(match, sr.result + sr.search_term_index, sr.search_term_len); + memcpy(suffix, sr.result + sr.search_term_index + sr.search_term_len, + sr.len - (sr.search_term_index+sr.search_term_len)); + sprintf(search_result, "%s%s%s", prefix, match, suffix); + sprintf(search_result_friendly, "%s%s%s%s%s%s", normal, prefix, bold, match, normal, suffix); + + free(prefix); + free(match); + free(suffix); + + search_result_start_offset = sr.search_term_index; + } +} diff --git a/deps/linenoise/linenoise.h b/deps/linenoise/linenoise.h index 6dfee73bcd4..beac6df467a 100644 --- a/deps/linenoise/linenoise.h +++ b/deps/linenoise/linenoise.h @@ -58,7 +58,7 @@ void linenoiseAddCompletion(linenoiseCompletions *, const char *); char *linenoise(const char *prompt); void linenoiseFree(void *ptr); -int linenoiseHistoryAdd(const char *line); +int linenoiseHistoryAdd(const char *line, int is_sensitive); int linenoiseHistorySetMaxLen(int len); int linenoiseHistorySave(const char *filename); int linenoiseHistoryLoad(const char *filename); diff --git a/deps/lua/src/loslib.c b/deps/lua/src/loslib.c index da06a572acf..403f41634a1 100644 --- a/deps/lua/src/loslib.c +++ b/deps/lua/src/loslib.c @@ -234,10 +234,17 @@ static const 
luaL_Reg syslib[] = { /* }====================================================== */ +#define UNUSED(V) ((void) V) +/* Only a subset is loaded currently, for sandboxing concerns. */ +static const luaL_Reg sandbox_syslib[] = { + {"clock", os_clock}, + {NULL, NULL} +}; LUALIB_API int luaopen_os (lua_State *L) { - luaL_register(L, LUA_OSLIBNAME, syslib); + UNUSED(syslib); + luaL_register(L, LUA_OSLIBNAME, sandbox_syslib); return 1; } diff --git a/deps/lua/src/lua_cjson.c b/deps/lua/src/lua_cjson.c index c26c0d7b8ea..b86d73e97cf 100644 --- a/deps/lua/src/lua_cjson.c +++ b/deps/lua/src/lua_cjson.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include "lua.h" #include "lauxlib.h" @@ -141,13 +142,13 @@ typedef struct { typedef struct { json_token_type_t type; - int index; + size_t index; union { const char *string; double number; int boolean; } value; - int string_len; + size_t string_len; } json_token_t; static const char *char2escape[256] = { @@ -463,9 +464,8 @@ static void json_encode_exception(lua_State *l, json_config_t *cfg, strbuf_t *js static void json_append_string(lua_State *l, strbuf_t *json, int lindex) { const char *escstr; - int i; const char *str; - size_t len; + size_t i, len; str = lua_tolstring(l, lindex, &len); @@ -473,6 +473,8 @@ static void json_append_string(lua_State *l, strbuf_t *json, int lindex) * This buffer is reused constantly for small strings * If there are any excess pages, they won't be hit anyway. * This gains ~5% speedup. 
*/ + if (len > SIZE_MAX / 6 - 3) + abort(); /* Overflow check */ strbuf_ensure_empty_length(json, len * 6 + 2); strbuf_append_char_unsafe(json, '\"'); @@ -706,7 +708,7 @@ static int json_encode(lua_State *l) strbuf_t local_encode_buf; strbuf_t *encode_buf; char *json; - int len; + size_t len; luaL_argcheck(l, lua_gettop(l) == 1, 1, "expected 1 argument"); diff --git a/deps/lua/src/lua_cmsgpack.c b/deps/lua/src/lua_cmsgpack.c index 49c6dc7b0ce..5f8929d454d 100644 --- a/deps/lua/src/lua_cmsgpack.c +++ b/deps/lua/src/lua_cmsgpack.c @@ -117,7 +117,9 @@ mp_buf *mp_buf_new(lua_State *L) { void mp_buf_append(lua_State *L, mp_buf *buf, const unsigned char *s, size_t len) { if (buf->free < len) { - size_t newsize = (buf->len+len)*2; + size_t newsize = buf->len+len; + if (newsize < buf->len || newsize >= SIZE_MAX/2) abort(); + newsize *= 2; buf->b = (unsigned char*)mp_realloc(L, buf->b, buf->len + buf->free, newsize); buf->free = newsize - buf->len; @@ -173,7 +175,7 @@ void mp_cur_init(mp_cur *cursor, const unsigned char *s, size_t len) { void mp_encode_bytes(lua_State *L, mp_buf *buf, const unsigned char *s, size_t len) { unsigned char hdr[5]; - int hdrlen; + size_t hdrlen; if (len < 32) { hdr[0] = 0xa0 | (len&0xff); /* fix raw */ @@ -220,7 +222,7 @@ void mp_encode_double(lua_State *L, mp_buf *buf, double d) { void mp_encode_int(lua_State *L, mp_buf *buf, int64_t n) { unsigned char b[9]; - int enclen; + size_t enclen; if (n >= 0) { if (n <= 127) { @@ -290,9 +292,9 @@ void mp_encode_int(lua_State *L, mp_buf *buf, int64_t n) { mp_buf_append(L,buf,b,enclen); } -void mp_encode_array(lua_State *L, mp_buf *buf, int64_t n) { +void mp_encode_array(lua_State *L, mp_buf *buf, uint64_t n) { unsigned char b[5]; - int enclen; + size_t enclen; if (n <= 15) { b[0] = 0x90 | (n & 0xf); /* fix array */ @@ -313,7 +315,7 @@ void mp_encode_array(lua_State *L, mp_buf *buf, int64_t n) { mp_buf_append(L,buf,b,enclen); } -void mp_encode_map(lua_State *L, mp_buf *buf, int64_t n) { +void 
mp_encode_map(lua_State *L, mp_buf *buf, uint64_t n) { unsigned char b[5]; int enclen; @@ -791,7 +793,7 @@ void mp_decode_to_lua_type(lua_State *L, mp_cur *c) { } } -int mp_unpack_full(lua_State *L, int limit, int offset) { +int mp_unpack_full(lua_State *L, lua_Integer limit, lua_Integer offset) { size_t len; const char *s; mp_cur c; @@ -803,10 +805,10 @@ int mp_unpack_full(lua_State *L, int limit, int offset) { if (offset < 0 || limit < 0) /* requesting negative off or lim is invalid */ return luaL_error(L, "Invalid request to unpack with offset of %d and limit of %d.", - offset, len); + (int) offset, (int) len); else if (offset > len) return luaL_error(L, - "Start offset %d greater than input length %d.", offset, len); + "Start offset %d greater than input length %d.", (int) offset, (int) len); if (decode_all) limit = INT_MAX; @@ -828,12 +830,13 @@ int mp_unpack_full(lua_State *L, int limit, int offset) { /* c->left is the remaining size of the input buffer. * subtract the entire buffer size from the unprocessed size * to get our next start offset */ - int offset = len - c.left; + size_t new_offset = len - c.left; + if (new_offset > LONG_MAX) abort(); luaL_checkstack(L, 1, "in function mp_unpack_full"); /* Return offset -1 when we have have processed the entire buffer. */ - lua_pushinteger(L, c.left == 0 ? -1 : offset); + lua_pushinteger(L, c.left == 0 ? -1 : (lua_Integer) new_offset); /* Results are returned with the arg elements still * in place. Lua takes care of only returning * elements above the args for us. 
@@ -852,15 +855,15 @@ int mp_unpack(lua_State *L) { } int mp_unpack_one(lua_State *L) { - int offset = luaL_optinteger(L, 2, 0); + lua_Integer offset = luaL_optinteger(L, 2, 0); /* Variable pop because offset may not exist */ lua_pop(L, lua_gettop(L)-1); return mp_unpack_full(L, 1, offset); } int mp_unpack_limit(lua_State *L) { - int limit = luaL_checkinteger(L, 2); - int offset = luaL_optinteger(L, 3, 0); + lua_Integer limit = luaL_checkinteger(L, 2); + lua_Integer offset = luaL_optinteger(L, 3, 0); /* Variable pop because offset may not exist */ lua_pop(L, lua_gettop(L)-1); diff --git a/deps/lua/src/strbuf.c b/deps/lua/src/strbuf.c index f0f7f4b9a36..97ee940c900 100644 --- a/deps/lua/src/strbuf.c +++ b/deps/lua/src/strbuf.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "strbuf.h" @@ -38,22 +39,22 @@ static void die(const char *fmt, ...) va_end(arg); fprintf(stderr, "\n"); - exit(-1); + abort(); } -void strbuf_init(strbuf_t *s, int len) +void strbuf_init(strbuf_t *s, size_t len) { - int size; + size_t size; - if (len <= 0) + if (!len) size = STRBUF_DEFAULT_SIZE; else - size = len + 1; /* \0 terminator */ - + size = len + 1; + if (size < len) + die("Overflow, len: %zu", len); s->buf = NULL; s->size = size; s->length = 0; - s->increment = STRBUF_DEFAULT_INCREMENT; s->dynamic = 0; s->reallocs = 0; s->debug = 0; @@ -65,7 +66,7 @@ void strbuf_init(strbuf_t *s, int len) strbuf_ensure_null(s); } -strbuf_t *strbuf_new(int len) +strbuf_t *strbuf_new(size_t len) { strbuf_t *s; @@ -81,20 +82,10 @@ strbuf_t *strbuf_new(int len) return s; } -void strbuf_set_increment(strbuf_t *s, int increment) -{ - /* Increment > 0: Linear buffer growth rate - * Increment < -1: Exponential buffer growth rate */ - if (increment == 0 || increment == -1) - die("BUG: Invalid string increment"); - - s->increment = increment; -} - static inline void debug_stats(strbuf_t *s) { if (s->debug) { - fprintf(stderr, "strbuf(%lx) reallocs: %d, length: %d, size: %d\n", + fprintf(stderr, 
"strbuf(%lx) reallocs: %d, length: %zd, size: %zd\n", (long)s, s->reallocs, s->length, s->size); } } @@ -113,7 +104,7 @@ void strbuf_free(strbuf_t *s) free(s); } -char *strbuf_free_to_string(strbuf_t *s, int *len) +char *strbuf_free_to_string(strbuf_t *s, size_t *len) { char *buf; @@ -131,57 +122,61 @@ char *strbuf_free_to_string(strbuf_t *s, int *len) return buf; } -static int calculate_new_size(strbuf_t *s, int len) +static size_t calculate_new_size(strbuf_t *s, size_t len) { - int reqsize, newsize; + size_t reqsize, newsize; if (len <= 0) die("BUG: Invalid strbuf length requested"); /* Ensure there is room for optional NULL termination */ reqsize = len + 1; + if (reqsize < len) + die("Overflow, len: %zu", len); /* If the user has requested to shrink the buffer, do it exactly */ if (s->size > reqsize) return reqsize; newsize = s->size; - if (s->increment < 0) { + if (reqsize >= SIZE_MAX / 2) { + newsize = reqsize; + } else { /* Exponential sizing */ while (newsize < reqsize) - newsize *= -s->increment; - } else { - /* Linear sizing */ - newsize = ((newsize + s->increment - 1) / s->increment) * s->increment; + newsize *= 2; } + if (newsize < reqsize) + die("BUG: strbuf length would overflow, len: %zu", len); + return newsize; } /* Ensure strbuf can handle a string length bytes long (ignoring NULL * optional termination). 
*/ -void strbuf_resize(strbuf_t *s, int len) +void strbuf_resize(strbuf_t *s, size_t len) { - int newsize; + size_t newsize; newsize = calculate_new_size(s, len); if (s->debug > 1) { - fprintf(stderr, "strbuf(%lx) resize: %d => %d\n", + fprintf(stderr, "strbuf(%lx) resize: %zd => %zd\n", (long)s, s->size, newsize); } s->size = newsize; s->buf = realloc(s->buf, s->size); if (!s->buf) - die("Out of memory"); + die("Out of memory, len: %zu", len); s->reallocs++; } void strbuf_append_string(strbuf_t *s, const char *str) { - int space, i; + size_t i, space; space = strbuf_empty_length(s); @@ -197,55 +192,6 @@ void strbuf_append_string(strbuf_t *s, const char *str) } } -/* strbuf_append_fmt() should only be used when an upper bound - * is known for the output string. */ -void strbuf_append_fmt(strbuf_t *s, int len, const char *fmt, ...) -{ - va_list arg; - int fmt_len; - - strbuf_ensure_empty_length(s, len); - - va_start(arg, fmt); - fmt_len = vsnprintf(s->buf + s->length, len, fmt, arg); - va_end(arg); - - if (fmt_len < 0) - die("BUG: Unable to convert number"); /* This should never happen.. */ - - s->length += fmt_len; -} - -/* strbuf_append_fmt_retry() can be used when the there is no known - * upper bound for the output string. */ -void strbuf_append_fmt_retry(strbuf_t *s, const char *fmt, ...) -{ - va_list arg; - int fmt_len, try; - int empty_len; - - /* If the first attempt to append fails, resize the buffer appropriately - * and try again */ - for (try = 0; ; try++) { - va_start(arg, fmt); - /* Append the new formatted string */ - /* fmt_len is the length of the string required, excluding the - * trailing NULL */ - empty_len = strbuf_empty_length(s); - /* Add 1 since there is also space to store the terminating NULL. 
*/ - fmt_len = vsnprintf(s->buf + s->length, empty_len + 1, fmt, arg); - va_end(arg); - - if (fmt_len <= empty_len) - break; /* SUCCESS */ - if (try > 0) - die("BUG: length of formatted string changed"); - - strbuf_resize(s, s->length + fmt_len); - } - - s->length += fmt_len; -} /* vi:ai et sw=4 ts=4: */ diff --git a/deps/lua/src/strbuf.h b/deps/lua/src/strbuf.h index d861108c14c..c10f83f0db8 100644 --- a/deps/lua/src/strbuf.h +++ b/deps/lua/src/strbuf.h @@ -27,15 +27,13 @@ /* Size: Total bytes allocated to *buf * Length: String length, excluding optional NULL terminator. - * Increment: Allocation increments when resizing the string buffer. * Dynamic: True if created via strbuf_new() */ typedef struct { char *buf; - int size; - int length; - int increment; + size_t size; + size_t length; int dynamic; int reallocs; int debug; @@ -44,32 +42,26 @@ typedef struct { #ifndef STRBUF_DEFAULT_SIZE #define STRBUF_DEFAULT_SIZE 1023 #endif -#ifndef STRBUF_DEFAULT_INCREMENT -#define STRBUF_DEFAULT_INCREMENT -2 -#endif /* Initialise */ -extern strbuf_t *strbuf_new(int len); -extern void strbuf_init(strbuf_t *s, int len); -extern void strbuf_set_increment(strbuf_t *s, int increment); +extern strbuf_t *strbuf_new(size_t len); +extern void strbuf_init(strbuf_t *s, size_t len); /* Release */ extern void strbuf_free(strbuf_t *s); -extern char *strbuf_free_to_string(strbuf_t *s, int *len); +extern char *strbuf_free_to_string(strbuf_t *s, size_t *len); /* Management */ -extern void strbuf_resize(strbuf_t *s, int len); -static int strbuf_empty_length(strbuf_t *s); -static int strbuf_length(strbuf_t *s); -static char *strbuf_string(strbuf_t *s, int *len); -static void strbuf_ensure_empty_length(strbuf_t *s, int len); +extern void strbuf_resize(strbuf_t *s, size_t len); +static size_t strbuf_empty_length(strbuf_t *s); +static size_t strbuf_length(strbuf_t *s); +static char *strbuf_string(strbuf_t *s, size_t *len); +static void strbuf_ensure_empty_length(strbuf_t *s, size_t len); static 
char *strbuf_empty_ptr(strbuf_t *s); -static void strbuf_extend_length(strbuf_t *s, int len); +static void strbuf_extend_length(strbuf_t *s, size_t len); /* Update */ -extern void strbuf_append_fmt(strbuf_t *s, int len, const char *fmt, ...); -extern void strbuf_append_fmt_retry(strbuf_t *s, const char *format, ...); -static void strbuf_append_mem(strbuf_t *s, const char *c, int len); +static void strbuf_append_mem(strbuf_t *s, const char *c, size_t len); extern void strbuf_append_string(strbuf_t *s, const char *str); static void strbuf_append_char(strbuf_t *s, const char c); static void strbuf_ensure_null(strbuf_t *s); @@ -87,12 +79,12 @@ static inline int strbuf_allocated(strbuf_t *s) /* Return bytes remaining in the string buffer * Ensure there is space for a NULL terminator. */ -static inline int strbuf_empty_length(strbuf_t *s) +static inline size_t strbuf_empty_length(strbuf_t *s) { return s->size - s->length - 1; } -static inline void strbuf_ensure_empty_length(strbuf_t *s, int len) +static inline void strbuf_ensure_empty_length(strbuf_t *s, size_t len) { if (len > strbuf_empty_length(s)) strbuf_resize(s, s->length + len); @@ -103,12 +95,12 @@ static inline char *strbuf_empty_ptr(strbuf_t *s) return s->buf + s->length; } -static inline void strbuf_extend_length(strbuf_t *s, int len) +static inline void strbuf_extend_length(strbuf_t *s, size_t len) { s->length += len; } -static inline int strbuf_length(strbuf_t *s) +static inline size_t strbuf_length(strbuf_t *s) { return s->length; } @@ -124,14 +116,14 @@ static inline void strbuf_append_char_unsafe(strbuf_t *s, const char c) s->buf[s->length++] = c; } -static inline void strbuf_append_mem(strbuf_t *s, const char *c, int len) +static inline void strbuf_append_mem(strbuf_t *s, const char *c, size_t len) { strbuf_ensure_empty_length(s, len); memcpy(s->buf + s->length, c, len); s->length += len; } -static inline void strbuf_append_mem_unsafe(strbuf_t *s, const char *c, int len) +static inline void 
strbuf_append_mem_unsafe(strbuf_t *s, const char *c, size_t len) { memcpy(s->buf + s->length, c, len); s->length += len; @@ -142,7 +134,7 @@ static inline void strbuf_ensure_null(strbuf_t *s) s->buf[s->length] = 0; } -static inline char *strbuf_string(strbuf_t *s, int *len) +static inline char *strbuf_string(strbuf_t *s, size_t *len) { if (len) *len = s->length; diff --git a/redis.conf b/redis.conf index 97f077b0d0a..65e01b0742e 100644 --- a/redis.conf +++ b/redis.conf @@ -51,6 +51,7 @@ # # loadmodule /path/to/my_module.so # loadmodule /path/to/other_module.so +# loadmodule /path/to/args_module.so [arg [arg ...]] ################################## NETWORK ##################################### @@ -1162,7 +1163,8 @@ acllog-max-len 128 # configuration directive. # # The default of 5 produces good enough results. 10 Approximates very closely -# true LRU but costs more CPU. 3 is faster but not very accurate. +# true LRU but costs more CPU. 3 is faster but not very accurate. The maximum +# value that can be set is 64. # # maxmemory-samples 5 @@ -1382,6 +1384,10 @@ disable-thp yes # If the AOF is enabled on startup Redis will load the AOF, that is the file # with the better durability guarantees. # +# Note that changing this value in a config file of an existing database and +# restarting the server can lead to data loss. A conversion needs to be done +# by setting it via CONFIG command on a live server first. +# # Please check https://redis.io/topics/persistence for more information. appendonly no @@ -2070,7 +2076,7 @@ client-output-buffer-limit pubsub 32mb 8mb 60 # amount by default in order to avoid that a protocol desynchronization (for # instance due to a bug in the client) will lead to unbound memory usage in # the query buffer. However you can configure it here if you have very special -# needs, such us huge multi/exec requests or alike. +# needs, such as a command with huge argument, or huge multi/exec requests or alike. 
# # client-query-buffer-limit 1gb @@ -2195,6 +2201,26 @@ rdb-save-incremental-fsync yes # lfu-log-factor 10 # lfu-decay-time 1 + +# The maximum number of new client connections accepted per event-loop cycle. This configuration +# is set independently for TLS connections. +# +# By default, up to 10 new connections will be accepted per event-loop cycle for normal connections +# and up to 1 new connection per event-loop cycle for TLS connections. +# +# Adjusting this to a larger number can slightly improve efficiency for new connections +# at the risk of causing timeouts for regular commands on established connections. It is +# not advised to change this without ensuring that all clients have limited connection +# pools and exponential backoff in the case of command/connection timeouts. +# +# If your application is establishing a large number of new connections per second you should +# also consider tuning the value of tcp-backlog, which allows the kernel to buffer more +# pending connections before dropping or rejecting connections. +# +# max-new-connections-per-cycle 10 +# max-new-tls-connections-per-cycle 1 + + ########################### ACTIVE DEFRAGMENTATION ####################### # # What is active defragmentation?
@@ -2276,16 +2302,16 @@ jemalloc-bg-thread yes # the taskset command: # # Set redis server/io threads to cpu affinity 0,2,4,6: -# server_cpulist 0-7:2 +# server-cpulist 0-7:2 # # Set bio threads to cpu affinity 1,3: -# bio_cpulist 1,3 +# bio-cpulist 1,3 # # Set aof rewrite child process to cpu affinity 8,9,10,11: -# aof_rewrite_cpulist 8-11 +# aof-rewrite-cpulist 8-11 # # Set bgsave child process to cpu affinity 1,10,11 -# bgsave_cpulist 1,10-11 +# bgsave-cpulist 1,10-11 # In some cases redis will emit warnings and even refuse to start if it detects # that the system is in bad state, it is possible to suppress these warnings diff --git a/runtest-moduleapi b/runtest-moduleapi index ff685afb66c..910d581f2fc 100755 --- a/runtest-moduleapi +++ b/runtest-moduleapi @@ -55,4 +55,5 @@ $TCLSH tests/test_helper.tcl \ --single unit/moduleapi/async_rm_call \ --single unit/moduleapi/moduleauth \ --single unit/moduleapi/rdbloadsave \ +--single unit/moduleapi/crash \ "${@}" diff --git a/src/Makefile b/src/Makefile index ecbd2753d9f..cc84d09ad76 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,9 @@ # Redis Makefile -# Copyright (C) 2009 Salvatore Sanfilippo -# This file is released under the BSD license, see the COPYING file +# Copyright (c) 2011-Present, Redis Ltd. +# All rights reserved. +# +# Licensed under your choice of the Redis Source Available License 2.0 +# (RSALv2) or the Server Side Public License v1 (SSPLv1). # # The Makefile composes the final FINAL_CFLAGS and FINAL_LDFLAGS using # what is needed for Redis plus the standard CFLAGS and LDFLAGS passed. @@ -16,14 +19,20 @@ release_hdr := $(shell sh -c './mkreleasehdr.sh') uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not') CLANG := $(findstring clang,$(shell sh -c '$(CC) --version | head -1')) + +# Optimization flags. To override, the OPTIMIZATION variable can be passed, but +# some automatic defaults are added to it. 
To specify optimization flags +# explicitly without any defaults added, pass the OPT variable instead. OPTIMIZATION?=-O3 ifeq ($(OPTIMIZATION),-O3) ifeq (clang,$(CLANG)) - REDIS_CFLAGS+=-flto + OPTIMIZATION+=-flto else - REDIS_CFLAGS+=-flto=auto + OPTIMIZATION+=-flto=auto endif - REDIS_LDFLAGS+=-O3 -flto +endif +ifneq ($(OPTIMIZATION),-O0) + OPTIMIZATION+=-fno-omit-frame-pointer endif DEPENDENCY_TARGETS=hiredis linenoise lua hdr_histogram fpconv NODEPS:=clean distclean @@ -117,7 +126,7 @@ endif -include .make-settings FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(REDIS_CFLAGS) -FINAL_LDFLAGS=$(LDFLAGS) $(REDIS_LDFLAGS) $(DEBUG) +FINAL_LDFLAGS=$(LDFLAGS) $(OPT) $(REDIS_LDFLAGS) $(DEBUG) FINAL_LIBS=-lm DEBUG=-g -ggdb @@ -345,7 +354,7 @@ endif REDIS_SERVER_NAME=redis-server$(PROG_SUFFIX) REDIS_SENTINEL_NAME=redis-sentinel$(PROG_SUFFIX) -REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o +REDIS_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o ebuckets.o mstr.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o 
object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o REDIS_CLI_NAME=redis-cli$(PROG_SUFFIX) REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o redisassert.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o REDIS_BENCHMARK_NAME=redis-benchmark$(PROG_SUFFIX) @@ -420,7 +429,7 @@ $(TLS_MODULE_NAME): $(REDIS_SERVER_NAME) # redis-cli $(REDIS_CLI_NAME): $(REDIS_CLI_OBJ) - $(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/linenoise/linenoise.o $(FINAL_LIBS) $(TLS_CLIENT_LIBS) + $(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/linenoise/linenoise.o ../deps/hdr_histogram/libhdrhistogram.a $(FINAL_LIBS) $(TLS_CLIENT_LIBS) # redis-benchmark $(REDIS_BENCHMARK_NAME): $(REDIS_BENCHMARK_OBJ) @@ -435,11 +444,16 @@ DEP = $(REDIS_SERVER_OBJ:%.o=%.d) $(REDIS_CLI_OBJ:%.o=%.d) $(REDIS_BENCHMARK_OBJ %.o: %.c .make-prerequisites $(REDIS_CC) -MMD -o $@ -c $< -# The file commands.def is checked in and doesn't normally need to be rebuilt. It -# is built only if python is available and its prereqs are modified. +# The following files are checked in and don't normally need to be rebuilt. They +# are built only if python is available and their prereqs are modified. 
ifneq (,$(PYTHON)) $(COMMANDS_DEF_FILENAME).def: commands/*.json ../utils/generate-command-code.py $(QUIET_GEN)$(PYTHON) ../utils/generate-command-code.py $(GEN_COMMANDS_FLAGS) + +fmtargs.h: ../utils/generate-fmtargs.py + $(QUIET_GEN)sed '/Everything below this line/,$$d' $@ > $@.tmp + $(QUIET_GEN)$(PYTHON) ../utils/generate-fmtargs.py >> $@.tmp + $(QUIET_GEN)mv $@.tmp $@ endif commands.c: $(COMMANDS_DEF_FILENAME).def diff --git a/src/acl.c b/src/acl.c index aa42c58dcff..c2cca0f3fdd 100644 --- a/src/acl.c +++ b/src/acl.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2018, Salvatore Sanfilippo + * Copyright (c) 2018-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -59,10 +38,12 @@ static rax *commandId = NULL; /* Command name to id mapping */ static unsigned long nextid = 0; /* Next command id that has not been assigned */ +#define ACL_MAX_CATEGORIES 64 /* Maximum number of command categories */ + struct ACLCategoryItem { - const char *name; + char *name; uint64_t flag; -} ACLCommandCategories[] = { /* See redis.conf for details on each category. */ +} ACLDefaultCommandCategories[] = { /* See redis.conf for details on each category. */ {"keyspace", ACL_CATEGORY_KEYSPACE}, {"read", ACL_CATEGORY_READ}, {"write", ACL_CATEGORY_WRITE}, @@ -87,6 +68,54 @@ struct ACLCategoryItem { {NULL,0} /* Terminator. */ }; +static struct ACLCategoryItem *ACLCommandCategories = NULL; +static size_t nextCommandCategory = 0; /* Index of the next command category to be added */ + +/* Implements the ability to add to the list of ACL categories at runtime. Since each ACL category + * also requires a bit in the acl_categories flag, there is a limit to the number that can be added. + * The new ACL categories occupy the remaining bits of acl_categories flag, other than the bits + * occupied by the default ACL command categories. + * + * The optional `flag` argument allows the assignment of the `acl_categories` flag bit to the ACL category. 
+ * When adding a new category, except for the default ACL command categories, this argument should be `0` + * to allow the function to assign the next available `acl_categories` flag bit to the new ACL category. + * + * returns 1 -> Added, 0 -> Failed (out of space) + * + * This function is present here to gain access to the ACLCommandCategories array and add a new ACL category. + */ +int ACLAddCommandCategory(const char *name, uint64_t flag) { + if (nextCommandCategory >= ACL_MAX_CATEGORIES) return 0; + ACLCommandCategories[nextCommandCategory].name = zstrdup(name); + ACLCommandCategories[nextCommandCategory].flag = flag != 0 ? flag : (1ULL<<nextCommandCategory); + nextCommandCategory++; + return 1; +} u->name = sdsnewlen(name,namelen); u->flags = USER_FLAG_DISABLED; @@ -456,15 +485,7 @@ void ACLFreeUserAndKillClients(user *u) { * this may result in some security hole: it's much * more defensive to set the default user and put * it in non authenticated mode. */ - c->user = DefaultUser; - c->authenticated = 0; - /* We will write replies to this client later, so we can't - * close it directly even if async. */ - if (c == server.current_client) { - c->flags |= CLIENT_CLOSE_AFTER_COMMAND; - } else { - freeClientAsync(c); - } + deauthenticateAndCloseClient(c); } } ACLFreeUser(u); @@ -489,12 +510,6 @@ void ACLCopyUser(user *dst, user *src) { } } -/* Free all the users registered in the radix tree 'users' and free the - * radix tree itself. */ -void ACLFreeUsersSet(rax *users) { - raxFreeWithCallback(users,(void(*)(void*))ACLFreeUserAndKillClients); -} - /* Given a command ID, this function set by reference 'word' and 'bit' * so that user->allowed_commands[word] will address the right word * where the corresponding bit for the provided ID is stored, and @@ -563,7 +578,7 @@ void ACLSelectorRemoveCommandRule(aclSelector *selector, sds new_rule) { * as well if the command is removed. */ char *rule_end = strchr(existing_rule, ' '); if (!rule_end) { - /* This is the last rule, so it it to the end of the string.
*/ + /* This is the last rule, so move it to the end of the string. */ rule_end = existing_rule + strlen(existing_rule); /* This approach can leave a trailing space if the last rule is removed, @@ -580,6 +595,8 @@ void ACLSelectorRemoveCommandRule(aclSelector *selector, sds new_rule) { /* Copy the remaining rules starting at the next rule to replace the rule to be * deleted, including the terminating NULL character. */ memmove(copy_position, copy_end, strlen(copy_end) + 1); + existing_rule = copy_position; + continue; } } existing_rule = copy_end; @@ -911,7 +928,7 @@ void ACLResetFirstArgs(aclSelector *selector) { selector->allowed_firstargs = NULL; } -/* Add a first-arh to the list of subcommands for the user 'u' and +/* Add a first-arg to the list of subcommands for the user 'u' and * the command id specified. */ void ACLAddAllowedFirstArg(aclSelector *selector, unsigned long id, const char *sub) { /* If this is the first first-arg to be configured for @@ -1396,6 +1413,7 @@ user *ACLCreateDefaultUser(void) { void ACLInit(void) { Users = raxNew(); UsersToLoad = listCreate(); + ACLInitCommandCategories(); listSetMatchMethod(UsersToLoad, ACLListMatchLoadedUser); ACLLog = listCreate(); DefaultUser = ACLCreateDefaultUser(); @@ -1405,7 +1423,7 @@ void ACLInit(void) { * otherwise C_ERR is returned and errno is set to: * * EINVAL: if the username-password do not match. - * ENONENT: if the specified user does not exist at all. + * ENOENT: if the specified user does not exist at all. 
*/ int ACLCheckUserCredentials(robj *username, robj *password) { user *u = ACLGetUserByName(username->ptr,sdslen(username->ptr)); @@ -1500,8 +1518,8 @@ unsigned long ACLGetCommandID(sds cmdname) { sds lowername = sdsdup(cmdname); sdstolower(lowername); if (commandId == NULL) commandId = raxNew(); - void *id = raxFind(commandId,(unsigned char*)lowername,sdslen(lowername)); - if (id != raxNotFound) { + void *id; + if (raxFind(commandId,(unsigned char*)lowername,sdslen(lowername),&id)) { sdsfree(lowername); return (unsigned long)id; } @@ -1532,8 +1550,8 @@ void ACLClearCommandID(void) { /* Return an username by its name, or NULL if the user does not exist. */ user *ACLGetUserByName(const char *name, size_t namelen) { - void *myuser = raxFind(Users,(unsigned char*)name,namelen); - if (myuser == raxNotFound) return NULL; + void *myuser = NULL; + raxFind(Users,(unsigned char*)name,namelen,&myuser); return myuser; } @@ -1856,23 +1874,20 @@ int ACLCheckAllPerm(client *c, int *idxptr) { return ACLCheckAllUserCommandPerm(c->user, c->cmd, c->argv, c->argc, idxptr); } -/* Check if the user's existing pub/sub clients violate the ACL pub/sub - * permissions specified via the upcoming argument, and kill them if so. */ -void ACLKillPubsubClientsIfNeeded(user *new, user *original) { +/* If 'new' can access all channels 'original' could then return NULL; + Otherwise return a list of channels that the new user can access */ +list *getUpcomingChannelList(user *new, user *original) { listIter li, lpi; listNode *ln, *lpn; - robj *o; - int kill = 0; - - /* First optimization is we check if any selector has all channel - * permissions. */ + + /* Optimization: we check if any selector has all channel permissions. 
*/ listRewind(new->selectors,&li); while((ln = listNext(&li))) { aclSelector *s = (aclSelector *) listNodeValue(ln); - if (s->flags & SELECTOR_FLAG_ALLCHANNELS) return; + if (s->flags & SELECTOR_FLAG_ALLCHANNELS) return NULL; } - /* Second optimization is to check if the new list of channels + /* Next, check if the new list of channels * is a strict superset of the original. This is done by * created an "upcoming" list of all channels that are in * the new user and checking each of the existing channels @@ -1910,58 +1925,87 @@ void ACLKillPubsubClientsIfNeeded(user *new, user *original) { if (match) { /* All channels were matched, no need to kill clients. */ listRelease(upcoming); - return; + return NULL; } - - /* Permissions have changed, so we need to iterate through all - * the clients and disconnect those that are no longer valid. - * Scan all connected clients to find the user's pub/subs. */ - listRewind(server.clients,&li); - while ((ln = listNext(&li)) != NULL) { - client *c = listNodeValue(ln); - kill = 0; - if (c->user == original && getClientType(c) == CLIENT_TYPE_PUBSUB) { - /* Check for pattern violations. */ - dictIterator *di = dictGetIterator(c->pubsub_patterns); - dictEntry *de; + return upcoming; +} + +/* Check if the client should be killed because it is subscribed to channels that were + * permitted in the past, are not in the `upcoming` channel list. */ +int ACLShouldKillPubsubClient(client *c, list *upcoming) { + robj *o; + int kill = 0; + + if (getClientType(c) == CLIENT_TYPE_PUBSUB) { + /* Check for pattern violations. */ + dictIterator *di = dictGetIterator(c->pubsub_patterns); + dictEntry *de; + while (!kill && ((de = dictNext(di)) != NULL)) { + o = dictGetKey(de); + int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 1); + kill = (res == ACL_DENIED_CHANNEL); + } + dictReleaseIterator(di); + + /* Check for channel violations. */ + if (!kill) { + /* Check for global channels violation. 
*/ + di = dictGetIterator(c->pubsub_channels); + while (!kill && ((de = dictNext(di)) != NULL)) { o = dictGetKey(de); - int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 1); + int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 0); kill = (res == ACL_DENIED_CHANNEL); } dictReleaseIterator(di); - - /* Check for channel violations. */ - if (!kill) { - /* Check for global channels violation. */ - di = dictGetIterator(c->pubsub_channels); - while (!kill && ((de = dictNext(di)) != NULL)) { - o = dictGetKey(de); - int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 0); - kill = (res == ACL_DENIED_CHANNEL); - } - dictReleaseIterator(di); - } - - if (!kill) { - /* Check for shard channels violation. */ - di = dictGetIterator(c->pubsubshard_channels); - while (!kill && ((de = dictNext(di)) != NULL)) { - o = dictGetKey(de); - int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 0); - kill = (res == ACL_DENIED_CHANNEL); - } - dictReleaseIterator(di); + } + if (!kill) { + /* Check for shard channels violation. */ + di = dictGetIterator(c->pubsubshard_channels); + while (!kill && ((de = dictNext(di)) != NULL)) { + o = dictGetKey(de); + int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 0); + kill = (res == ACL_DENIED_CHANNEL); } + dictReleaseIterator(di); + } - /* Kill it. */ - if (kill) { - freeClient(c); - } + if (kill) { + return 1; } } - listRelease(upcoming); + return 0; +} + +/* Check if the user's existing pub/sub clients violate the ACL pub/sub + * permissions specified via the upcoming argument, and kill them if so. */ +void ACLKillPubsubClientsIfNeeded(user *new, user *original) { + /* Do nothing if there are no subscribers. */ + if (pubsubTotalSubscriptions() == 0) + return; + + list *channels = getUpcomingChannelList(new, original); + /* If the new user's pubsub permissions are a strict superset of the original, return early. 
*/ + if (!channels) + return; + + listIter li; + listNode *ln; + + /* Permissions have changed, so we need to iterate through all + * the clients and disconnect those that are no longer valid. + * Scan all connected clients to find the user's pub/subs. */ + listRewind(server.clients,&li); + while ((ln = listNext(&li)) != NULL) { + client *c = listNodeValue(ln); + if (c->user != original) + continue; + if (ACLShouldKillPubsubClient(c, channels)) + deauthenticateAndCloseClient(c); + } + + listRelease(channels); } /* ============================================================================= @@ -1990,7 +2034,8 @@ sds *ACLMergeSelectorArguments(sds *argv, int argc, int *merged_argc, int *inval for (int j = 0; j < argc; j++) { char *op = argv[j]; - if (op[0] == '(' && op[sdslen(op) - 1] != ')') { + if (open_bracket_start == -1 && + (op[0] == '(' && op[sdslen(op) - 1] != ')')) { selector = sdsdup(argv[j]); open_bracket_start = j; continue; @@ -2367,11 +2412,46 @@ sds ACLLoadFromFile(const char *filename) { ACLFreeUser(new_default); raxInsert(Users,(unsigned char*)"default",7,DefaultUser,NULL); raxRemove(old_users,(unsigned char*)"default",7,NULL); - ACLFreeUsersSet(old_users); + + /* If there are some subscribers, we need to check if we need to drop some clients. 
*/ + rax *user_channels = NULL; + if (pubsubTotalSubscriptions() > 0) { + user_channels = raxNew(); + } + + listIter li; + listNode *ln; + + listRewind(server.clients,&li); + while ((ln = listNext(&li)) != NULL) { + client *c = listNodeValue(ln); + /* a MASTER client can do everything (and user = NULL) so we can skip it */ + if (c->flags & CLIENT_MASTER) + continue; + user *original = c->user; + list *channels = NULL; + user *new = ACLGetUserByName(c->user->name, sdslen(c->user->name)); + if (new && user_channels) { + if (!raxFind(user_channels, (unsigned char*)(new->name), sdslen(new->name), (void**)&channels)) { + channels = getUpcomingChannelList(new, original); + raxInsert(user_channels, (unsigned char*)(new->name), sdslen(new->name), channels, NULL); + } + } + /* When the new channel list is NULL, it means the new user's channel list is a superset of the old user's list. */ + if (!new || (channels && ACLShouldKillPubsubClient(c, channels))) { + deauthenticateAndCloseClient(c); + continue; + } + c->user = new; + } + + if (user_channels) + raxFreeWithCallback(user_channels, (void(*)(void*))listRelease); + raxFreeWithCallback(old_users,(void(*)(void*))ACLFreeUser); sdsfree(errors); return NULL; } else { - ACLFreeUsersSet(Users); + raxFreeWithCallback(Users,(void(*)(void*))ACLFreeUser); Users = old_users; errors = sdscat(errors,"WARNING: ACL errors detected, no change to the previously active ACL rules was performed"); return errors; @@ -2549,6 +2629,15 @@ void ACLUpdateInfoMetrics(int reason){ } } +static void trimACLLogEntriesToMaxLen(void) { + while(listLength(ACLLog) > server.acllog_max_len) { + listNode *ln = listLast(ACLLog); + ACLLogEntry *le = listNodeValue(ln); + ACLFreeLogEntry(le); + listDelNode(ACLLog,ln); + } +} + /* Adds a new entry in the ACL log, making sure to delete the old entry * if we reach the maximum length allowed for the log. 
This function attempts * to find similar entries in the current log in order to bump the counter of @@ -2568,6 +2657,11 @@ void addACLLogEntry(client *c, int reason, int context, int argpos, sds username /* Update ACL info metrics */ ACLUpdateInfoMetrics(reason); + if (server.acllog_max_len == 0) { + trimACLLogEntriesToMaxLen(); + return; + } + /* Create a new entry. */ struct ACLLogEntry *le = zmalloc(sizeof(*le)); le->count = 1; @@ -2630,12 +2724,7 @@ void addACLLogEntry(client *c, int reason, int context, int argpos, sds username * to its maximum size. */ ACLLogEntryCount++; /* Incrementing the entry_id count to make each record in the log unique. */ listAddNodeHead(ACLLog, le); - while(listLength(ACLLog) > server.acllog_max_len) { - listNode *ln = listLast(ACLLog); - ACLLogEntry *le = listNodeValue(ln); - ACLFreeLogEntry(le); - listDelNode(ACLLog,ln); - } + trimACLLogEntriesToMaxLen(); } } @@ -2759,8 +2848,7 @@ void aclCommand(client *c) { sds username = c->argv[2]->ptr; /* Check username validity. */ if (ACLStringHasSpaces(username,sdslen(username))) { - addReplyErrorFormat(c, - "Usernames can't contain spaces or null characters"); + addReplyError(c, "Usernames can't contain spaces or null characters"); return; } @@ -2778,6 +2866,10 @@ void aclCommand(client *c) { } return; } else if (!strcasecmp(sub,"deluser") && c->argc >= 3) { + /* Initially redact all the arguments to not leak any information + * about the users. */ + for (int j = 2; j < c->argc; j++) redactClientCommandArgument(c, j); + int deleted = 0; for (int j = 2; j < c->argc; j++) { sds username = c->argv[j]->ptr; @@ -2800,6 +2892,9 @@ void aclCommand(client *c) { } addReplyLongLong(c,deleted); } else if (!strcasecmp(sub,"getuser") && c->argc == 3) { + /* Redact the username to not leak any information about the user. 
*/ + redactClientCommandArgument(c, 2); + user *u = ACLGetUserByName(c->argv[2]->ptr,sdslen(c->argv[2]->ptr)); if (u == NULL) { addReplyNull(c); diff --git a/src/adlist.c b/src/adlist.c index f031c46e87d..0e8f6d5c14a 100644 --- a/src/adlist.c +++ b/src/adlist.c @@ -1,31 +1,10 @@ /* adlist.c - A generic doubly linked list implementation * - * Copyright (c) 2006-2010, Salvatore Sanfilippo + * Copyright (c) 2006-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
+ * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ @@ -76,6 +55,8 @@ void listEmpty(list *list) * This function can't fail. */ void listRelease(list *list) { + if (!list) + return; listEmpty(list); zfree(list); } diff --git a/src/adlist.h b/src/adlist.h index 7c5443769b3..b91fe5070ef 100644 --- a/src/adlist.h +++ b/src/adlist.h @@ -1,31 +1,10 @@ /* adlist.h - A generic doubly linked list implementation * - * Copyright (c) 2006-2012, Salvatore Sanfilippo + * Copyright (c) 2006-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef __ADLIST_H__ diff --git a/src/ae.c b/src/ae.c index 1b6422b2db8..3d3569865ae 100644 --- a/src/ae.c +++ b/src/ae.c @@ -2,32 +2,11 @@ * for the Jim's event-loop (Jim is a Tcl interpreter) but later translated * it in form of a library for easy reuse. * - * Copyright (c) 2006-2010, Salvatore Sanfilippo + * Copyright (c) 2006-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "ae.h" @@ -149,6 +128,8 @@ void aeDeleteEventLoop(aeEventLoop *eventLoop) { aeTimeEvent *next_te, *te = eventLoop->timeEventHead; while (te) { next_te = te->next; + if (te->finalizerProc) + te->finalizerProc(eventLoop, te->clientData); zfree(te); te = next_te; } @@ -333,7 +314,7 @@ static int processTimeEvents(aeEventLoop *eventLoop) { processed++; now = getMonotonicUs(); if (retval != AE_NOMORE) { - te->when = now + retval * 1000; + te->when = now + (monotime)retval * 1000; } else { te->id = AE_DELETED_EVENT_ID; } @@ -343,8 +324,8 @@ static int processTimeEvents(aeEventLoop *eventLoop) { return processed; } -/* Process every pending time event, then every pending file event - * (that may be registered by time event callbacks just processed). +/* Process every pending file event, then every pending time event + * (that may be registered by file event callbacks just processed). * Without special flags the function sleeps until some file event * fires, or when the next time event occurs (if any). 
* diff --git a/src/ae.h b/src/ae.h index 70ce8a2d543..5f1e17f7dc7 100644 --- a/src/ae.h +++ b/src/ae.h @@ -2,32 +2,11 @@ * for the Jim's event-loop (Jim is a Tcl interpreter) but later translated * it in form of a library for easy reuse. * - * Copyright (c) 2006-2012, Salvatore Sanfilippo + * Copyright (c) 2006-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #ifndef __AE_H__ diff --git a/src/ae_epoll.c b/src/ae_epoll.c index 493ffcad2eb..d03d694feaf 100644 --- a/src/ae_epoll.c +++ b/src/ae_epoll.c @@ -1,31 +1,10 @@ /* Linux epoll(2) based ae.c module * - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ diff --git a/src/ae_select.c b/src/ae_select.c index f8ef959662c..63d0dfb82a9 100644 --- a/src/ae_select.c +++ b/src/ae_select.c @@ -1,31 +1,10 @@ /* Select()-based ae.c module. * - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ diff --git a/src/anet.c b/src/anet.c index 790ea7e0aca..705b9e5ce7f 100644 --- a/src/anet.c +++ b/src/anet.c @@ -1,31 +1,10 @@ /* anet.c -- Basic TCP socket stuff made a bit less boring * - * Copyright (c) 2006-2012, Salvatore Sanfilippo + * Copyright (c) 2006-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #include "fmacros.h" @@ -82,7 +61,7 @@ int anetSetBlock(char *err, int fd, int non_block) { return ANET_ERR; } - /* Check if this flag has been set or unset, if so, + /* Check if this flag has been set or unset, if so, * then there is no need to call fcntl to set/unset it again. */ if (!!(flags & O_NONBLOCK) == !!non_block) return ANET_OK; @@ -107,8 +86,8 @@ int anetBlock(char *err, int fd) { return anetSetBlock(err,fd,0); } -/* Enable the FD_CLOEXEC on the given fd to avoid fd leaks. - * This function should be invoked for fd's on specific places +/* Enable the FD_CLOEXEC on the given fd to avoid fd leaks. + * This function should be invoked for fd's on specific places * where fork + execve system calls are called. */ int anetCloexec(int fd) { int r; @@ -130,57 +109,145 @@ int anetCloexec(int fd) { return r; } -/* Set TCP keep alive option to detect dead peers. The interval option - * is only used for Linux as we are using Linux-specific APIs to set - * the probe send time, interval, and count. */ +/* Enable TCP keep-alive mechanism to detect dead peers, + * TCP_KEEPIDLE, TCP_KEEPINTVL and TCP_KEEPCNT will be set accordingly. */ int anetKeepAlive(char *err, int fd, int interval) { - int val = 1; - - if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &val, sizeof(val)) == -1) + int enabled = 1; + if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &enabled, sizeof(enabled))) { anetSetError(err, "setsockopt SO_KEEPALIVE: %s", strerror(errno)); return ANET_ERR; } -#ifdef __linux__ + int idle; + int intvl; + int cnt; + + /* There are platforms that are expected to support the full mechanism of TCP keep-alive, + * we want the compiler to emit warnings of unused variables if the preprocessor directives + * somehow fail, and other than those platforms, just omit these warnings if they happen. 
+ */ +#if !(defined(_AIX) || defined(__APPLE__) || defined(__DragonFly__) || \ + defined(__FreeBSD__) || defined(__illumos__) || defined(__linux__) || \ + defined(__NetBSD__) || defined(__sun)) + UNUSED(interval); + UNUSED(idle); + UNUSED(intvl); + UNUSED(cnt); +#endif + +#ifdef __sun + /* The implementation of TCP keep-alive on Solaris/SmartOS is a bit unusual + * compared to other Unix-like systems. + * Thus, we need to specialize it on Solaris. + * + * There are two keep-alive mechanisms on Solaris: + * - By default, the first keep-alive probe is sent out after a TCP connection is idle for two hours. + * If the peer does not respond to the probe within eight minutes, the TCP connection is aborted. + * You can alter the interval for sending out the first probe using the socket option TCP_KEEPALIVE_THRESHOLD + * in milliseconds or TCP_KEEPIDLE in seconds. + * The system default is controlled by the TCP ndd parameter tcp_keepalive_interval. The minimum value is ten seconds. + * The maximum is ten days, while the default is two hours. If you receive no response to the probe, + * you can use the TCP_KEEPALIVE_ABORT_THRESHOLD socket option to change the time threshold for aborting a TCP connection. + * The option value is an unsigned integer in milliseconds. The value zero indicates that TCP should never time out and + * abort the connection when probing. The system default is controlled by the TCP ndd parameter tcp_keepalive_abort_interval. + * The default is eight minutes. + * + * - The second implementation is activated if socket option TCP_KEEPINTVL and/or TCP_KEEPCNT are set. + * The time between each consequent probes is set by TCP_KEEPINTVL in seconds. + * The minimum value is ten seconds. The maximum is ten days, while the default is two hours. + * The TCP connection will be aborted after certain amount of probes, which is set by TCP_KEEPCNT, without receiving response. 
+ */ + + idle = interval; + if (idle < 10) idle = 10; // kernel expects at least 10 seconds + if (idle > 10*24*60*60) idle = 10*24*60*60; // kernel expects at most 10 days + + /* `TCP_KEEPIDLE`, `TCP_KEEPINTVL`, and `TCP_KEEPCNT` were not available on Solaris + * until version 11.4, but let's take a chance here. */ +#if defined(TCP_KEEPIDLE) && defined(TCP_KEEPINTVL) && defined(TCP_KEEPCNT) + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle))) { + anetSetError(err, "setsockopt TCP_KEEPIDLE: %s\n", strerror(errno)); + return ANET_ERR; + } + + intvl = idle/3; + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl))) { + anetSetError(err, "setsockopt TCP_KEEPINTVL: %s\n", strerror(errno)); + return ANET_ERR; + } + + cnt = 3; + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt))) { + anetSetError(err, "setsockopt TCP_KEEPCNT: %s\n", strerror(errno)); + return ANET_ERR; + } +#else + /* Fall back to the first implementation of the TCP keep-alive mechanism for older Solaris, + * simulate the TCP keep-alive mechanism on other platforms via `TCP_KEEPALIVE_THRESHOLD` + `TCP_KEEPALIVE_ABORT_THRESHOLD`. + */ + idle *= 1000; // kernel expects milliseconds + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE_THRESHOLD, &idle, sizeof(idle))) { + anetSetError(err, "setsockopt TCP_KEEPALIVE_THRESHOLD: %s\n", strerror(errno)); + return ANET_ERR; + } + + /* Note that the consequent probes will not be sent at equal intervals on Solaris, + * but will be sent using the exponential backoff algorithm. */ + intvl = idle/3; + cnt = 3; + int time_to_abort = intvl * cnt; + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE_ABORT_THRESHOLD, &time_to_abort, sizeof(time_to_abort))) { + anetSetError(err, "setsockopt TCP_KEEPALIVE_ABORT_THRESHOLD: %s\n", strerror(errno)); + return ANET_ERR; + } +#endif + + return ANET_OK; + +#endif + +#ifdef TCP_KEEPIDLE /* Default settings are more or less garbage, with the keepalive time - * set to 7200 by default on Linux. 
Modify settings to make the feature - * actually useful. */ + * set to 7200 by default on Linux and other Unix-like systems. + * Modify settings to make the feature actually useful. */ /* Send first probe after interval. */ - val = interval; - if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &val, sizeof(val)) < 0) { + idle = interval; + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle))) { anetSetError(err, "setsockopt TCP_KEEPIDLE: %s\n", strerror(errno)); return ANET_ERR; } +#elif defined(TCP_KEEPALIVE) + /* Darwin/macOS uses TCP_KEEPALIVE in place of TCP_KEEPIDLE. */ + idle = interval; + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, &idle, sizeof(idle))) { + anetSetError(err, "setsockopt TCP_KEEPALIVE: %s\n", strerror(errno)); + return ANET_ERR; + } +#endif +#ifdef TCP_KEEPINTVL /* Send next probes after the specified interval. Note that we set the * delay as interval / 3, as we send three probes before detecting * an error (see the next setsockopt call). */ - val = interval/3; - if (val == 0) val = 1; - if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &val, sizeof(val)) < 0) { + intvl = interval/3; + if (intvl == 0) intvl = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl))) { anetSetError(err, "setsockopt TCP_KEEPINTVL: %s\n", strerror(errno)); return ANET_ERR; } +#endif +#ifdef TCP_KEEPCNT /* Consider the socket in error state after three we send three ACK * probes without getting a reply. 
*/ - val = 3; - if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &val, sizeof(val)) < 0) { + cnt = 3; + if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt))) { anetSetError(err, "setsockopt TCP_KEEPCNT: %s\n", strerror(errno)); return ANET_ERR; } -#elif defined(__APPLE__) - /* Set idle time with interval */ - val = interval; - if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, &val, sizeof(val)) < 0) { - anetSetError(err, "setsockopt TCP_KEEPALIVE: %s\n", strerror(errno)); - return ANET_ERR; - } -#else - ((void) interval); /* Avoid unused var warning for non Linux systems. */ #endif return ANET_OK; @@ -239,7 +306,11 @@ int anetRecvTimeout(char *err, int fd, long long ms) { * * If flags is set to ANET_IP_ONLY the function only resolves hostnames * that are actually already IPv4 or IPv6 addresses. This turns the function - * into a validating / normalizing function. */ + * into a validating / normalizing function. + * + * If the flag ANET_PREFER_IPV4 is set, IPv4 is preferred over IPv6. + * If the flag ANET_PREFER_IPV6 is set, IPv6 is preferred over IPv4. + * */ int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len, int flags) { @@ -249,9 +320,20 @@ int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len, memset(&hints,0,sizeof(hints)); if (flags & ANET_IP_ONLY) hints.ai_flags = AI_NUMERICHOST; hints.ai_family = AF_UNSPEC; + if (flags & ANET_PREFER_IPV4 && !(flags & ANET_PREFER_IPV6)) { + hints.ai_family = AF_INET; + } else if (flags & ANET_PREFER_IPV6 && !(flags & ANET_PREFER_IPV4)) { + hints.ai_family = AF_INET6; + } hints.ai_socktype = SOCK_STREAM; /* specify socktype to avoid dups */ - if ((rv = getaddrinfo(host, NULL, &hints, &info)) != 0) { + rv = getaddrinfo(host, NULL, &hints, &info); + if (rv != 0 && hints.ai_family != AF_UNSPEC) { + /* Try the other IP version. */ + hints.ai_family = (hints.ai_family == AF_INET) ? 
AF_INET6 : AF_INET; + rv = getaddrinfo(host, NULL, &hints, &info); + } + if (rv != 0) { anetSetError(err, "%s", gai_strerror(rv)); return ANET_ERR; } @@ -417,13 +499,16 @@ int anetUnixGenericConnect(char *err, const char *path, int flags) return s; } -static int anetListen(char *err, int s, struct sockaddr *sa, socklen_t len, int backlog) { +static int anetListen(char *err, int s, struct sockaddr *sa, socklen_t len, int backlog, mode_t perm) { if (bind(s,sa,len) == -1) { anetSetError(err, "bind: %s", strerror(errno)); close(s); return ANET_ERR; } + if (sa->sa_family == AF_LOCAL && perm) + chmod(((struct sockaddr_un *) sa)->sun_path, perm); + if (listen(s, backlog) == -1) { anetSetError(err, "listen: %s", strerror(errno)); close(s); @@ -467,7 +552,7 @@ static int _anetTcpServer(char *err, int port, char *bindaddr, int af, int backl if (af == AF_INET6 && anetV6Only(err,s) == ANET_ERR) goto error; if (anetSetReuseAddr(err,s) == ANET_ERR) goto error; - if (anetListen(err,s,p->ai_addr,p->ai_addrlen,backlog) == ANET_ERR) s = ANET_ERR; + if (anetListen(err,s,p->ai_addr,p->ai_addrlen,backlog,0) == ANET_ERR) s = ANET_ERR; goto end; } if (p == NULL) { @@ -508,10 +593,8 @@ int anetUnixServer(char *err, char *path, mode_t perm, int backlog) memset(&sa,0,sizeof(sa)); sa.sun_family = AF_LOCAL; redis_strlcpy(sa.sun_path,path,sizeof(sa.sun_path)); - if (anetListen(err,s,(struct sockaddr*)&sa,sizeof(sa),backlog) == ANET_ERR) + if (anetListen(err,s,(struct sockaddr*)&sa,sizeof(sa),backlog,perm) == ANET_ERR) return ANET_ERR; - if (perm) - chmod(sa.sun_path, perm); return s; } diff --git a/src/anet.h b/src/anet.h index b13c14f7758..8ad5f4b0bab 100644 --- a/src/anet.h +++ b/src/anet.h @@ -1,31 +1,10 @@ /* anet.c -- Basic TCP socket stuff made a bit less boring * - * Copyright (c) 2006-2012, Salvatore Sanfilippo + * Copyright (c) 2006-Present, Redis Ltd. * All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef ANET_H @@ -40,6 +19,8 @@ /* Flags used with certain functions. 
*/ #define ANET_NONE 0 #define ANET_IP_ONLY (1<<0) +#define ANET_PREFER_IPV4 (1<<1) +#define ANET_PREFER_IPV6 (1<<2) #if defined(__sun) || defined(_AIX) #define AF_LOCAL AF_UNIX diff --git a/src/aof.c b/src/aof.c index 468d577f8e9..ec631c0e214 100644 --- a/src/aof.c +++ b/src/aof.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
+ * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -117,7 +96,9 @@ aofInfo *aofInfoDup(aofInfo *orig) { return ai; } -/* Format aofInfo as a string and it will be a line in the manifest. */ +/* Format aofInfo as a string and it will be a line in the manifest. + * + * When update this format, make sure to update redis-check-aof as well. */ sds aofInfoFormat(sds buf, aofInfo *ai) { sds filename_repr = NULL; @@ -833,7 +814,7 @@ int openNewIncrAofForAppend(void) { * is already synced at this point so fsync doesn't matter. */ if (server.aof_fd != -1) { aof_background_fsync_and_close(server.aof_fd); - server.aof_last_fsync = server.unixtime; + server.aof_last_fsync = server.mstime; } server.aof_fd = newfd; @@ -954,7 +935,7 @@ void stopAppendOnly(void) { if (redis_fsync(server.aof_fd) == -1) { serverLog(LL_WARNING,"Fail to fsync the AOF file: %s",strerror(errno)); } else { - server.aof_last_fsync = server.unixtime; + server.aof_last_fsync = server.mstime; } close(server.aof_fd); @@ -976,18 +957,6 @@ void stopAppendOnly(void) { int startAppendOnly(void) { serverAssert(server.aof_state == AOF_OFF); - /* Wait for all bio jobs related to AOF to drain. This prevents a race - * between updates to `fsynced_reploff_pending` of the worker thread, belonging - * to the previous AOF, and the new one. This concern is specific for a full - * sync scenario where we don't wanna risk the ACKed replication offset - * jumping backwards or forward when switching to a different master. 
*/ - bioDrainWorker(BIO_AOF_FSYNC); - - /* Set the initial repl_offset, which will be applied to fsynced_reploff - * when AOFRW finishes (after possibly being updated by a bio thread) */ - atomicSet(server.fsynced_reploff_pending, server.master_repl_offset); - server.fsynced_reploff = 0; - server.aof_state = AOF_WAIT_REWRITE; if (hasActiveChildProcess() && server.child_type != CHILD_TYPE_AOF) { server.aof_rewrite_scheduled = 1; @@ -1010,7 +979,7 @@ int startAppendOnly(void) { return C_ERR; } } - server.aof_last_fsync = server.unixtime; + server.aof_last_fsync = server.mstime; /* If AOF fsync error in bio job, we just ignore it and log the event. */ int aof_bio_fsync_status; atomicGet(server.aof_bio_fsync_status, aof_bio_fsync_status); @@ -1086,7 +1055,7 @@ void flushAppendOnlyFile(int force) { * the data in page cache cannot be flushed in time. */ if (server.aof_fsync == AOF_FSYNC_EVERYSEC && server.aof_last_incr_fsync_offset != server.aof_last_incr_size && - server.unixtime > server.aof_last_fsync && + server.mstime - server.aof_last_fsync >= 1000 && !(sync_in_progress = aofFsyncInProgress())) { goto try_fsync; @@ -1099,6 +1068,13 @@ void flushAppendOnlyFile(int force) { { goto try_fsync; } else { + /* All data is fsync'd already: Update fsynced_reploff_pending just in case. 
+ * This is needed to avoid a WAITAOF hang in case a module used RM_Call with the NO_AOF flag, + * in which case master_repl_offset will increase but fsynced_reploff_pending won't be updated + * (because there's no reason, from the AOF POV, to call fsync) and then WAITAOF may wait on + * the higher offset (which contains data that was only propagated to replicas, and not to AOF) */ + if (!sync_in_progress && server.aof_fsync != AOF_FSYNC_NO) + atomicSet(server.fsynced_reploff_pending, server.master_repl_offset); return; } } @@ -1114,9 +1090,9 @@ void flushAppendOnlyFile(int force) { if (server.aof_flush_postponed_start == 0) { /* No previous write postponing, remember that we are * postponing the flush and return. */ - server.aof_flush_postponed_start = server.unixtime; + server.aof_flush_postponed_start = server.mstime; return; - } else if (server.unixtime - server.aof_flush_postponed_start < 2) { + } else if (server.mstime - server.aof_flush_postponed_start < 2000) { /* We were already waiting for fsync to finish, but for less * than two seconds this is still ok. Postpone again. 
*/ return; @@ -1265,15 +1241,15 @@ void flushAppendOnlyFile(int force) { latencyEndMonitor(latency); latencyAddSampleIfNeeded("aof-fsync-always",latency); server.aof_last_incr_fsync_offset = server.aof_last_incr_size; - server.aof_last_fsync = server.unixtime; + server.aof_last_fsync = server.mstime; atomicSet(server.fsynced_reploff_pending, server.master_repl_offset); } else if (server.aof_fsync == AOF_FSYNC_EVERYSEC && - server.unixtime > server.aof_last_fsync) { + server.mstime - server.aof_last_fsync >= 1000) { if (!sync_in_progress) { aof_background_fsync(server.aof_fd); server.aof_last_incr_fsync_offset = server.aof_last_incr_size; } - server.aof_last_fsync = server.unixtime; + server.aof_last_fsync = server.mstime; } } @@ -1859,6 +1835,7 @@ int rewriteSetObject(rio *r, robj *key, robj *o) { !rioWriteBulkString(r,"SADD",4) || !rioWriteBulkObject(r,key)) { + setTypeReleaseIterator(si); return 0; } } @@ -1962,19 +1939,21 @@ int rewriteSortedSetObject(rio *r, robj *key, robj *o) { * * The function returns 0 on error, non-zero on success. 
*/ static int rioWriteHashIteratorCursor(rio *r, hashTypeIterator *hi, int what) { - if (hi->encoding == OBJ_ENCODING_LISTPACK) { + if ((hi->encoding == OBJ_ENCODING_LISTPACK) || (hi->encoding == OBJ_ENCODING_LISTPACK_EX)) { unsigned char *vstr = NULL; unsigned int vlen = UINT_MAX; long long vll = LLONG_MAX; - hashTypeCurrentFromListpack(hi, what, &vstr, &vlen, &vll); + hashTypeCurrentFromListpack(hi, what, &vstr, &vlen, &vll, NULL); if (vstr) return rioWriteBulkString(r, (char*)vstr, vlen); else return rioWriteBulkLongLong(r, vll); } else if (hi->encoding == OBJ_ENCODING_HT) { - sds value = hashTypeCurrentFromHashTable(hi, what); - return rioWriteBulkString(r, value, sdslen(value)); + char *str; + size_t len; + hashTypeCurrentFromHashTable(hi, what, &str, &len, NULL); + return rioWriteBulkString(r, str, len); } serverPanic("Unknown hash encoding"); @@ -1984,37 +1963,60 @@ static int rioWriteHashIteratorCursor(rio *r, hashTypeIterator *hi, int what) { /* Emit the commands needed to rebuild a hash object. * The function returns 0 on error, 1 on success. */ int rewriteHashObject(rio *r, robj *key, robj *o) { + int res = 0; /*fail*/ + hashTypeIterator *hi; - long long count = 0, items = hashTypeLength(o); + long long count = 0, items = hashTypeLength(o, 0); + int isHFE = hashTypeGetMinExpire(o, 0) != EB_EXPIRE_TIME_INVALID; hi = hashTypeInitIterator(o); - while (hashTypeNext(hi) != C_ERR) { - if (count == 0) { - int cmd_items = (items > AOF_REWRITE_ITEMS_PER_CMD) ? - AOF_REWRITE_ITEMS_PER_CMD : items; - if (!rioWriteBulkCount(r,'*',2+cmd_items*2) || - !rioWriteBulkString(r,"HMSET",5) || - !rioWriteBulkObject(r,key)) - { - hashTypeReleaseIterator(hi); - return 0; + if (!isHFE) { + while (hashTypeNext(hi, 0) != C_ERR) { + if (count == 0) { + int cmd_items = (items > AOF_REWRITE_ITEMS_PER_CMD) ? 
+ AOF_REWRITE_ITEMS_PER_CMD : items; + if (!rioWriteBulkCount(r, '*', 2 + cmd_items * 2) || + !rioWriteBulkString(r, "HMSET", 5) || + !rioWriteBulkObject(r, key)) + goto reHashEnd; } - } - if (!rioWriteHashIteratorCursor(r, hi, OBJ_HASH_KEY) || - !rioWriteHashIteratorCursor(r, hi, OBJ_HASH_VALUE)) - { - hashTypeReleaseIterator(hi); - return 0; + if (!rioWriteHashIteratorCursor(r, hi, OBJ_HASH_KEY) || + !rioWriteHashIteratorCursor(r, hi, OBJ_HASH_VALUE)) + goto reHashEnd; + + if (++count == AOF_REWRITE_ITEMS_PER_CMD) count = 0; + items--; + } + } else { + while (hashTypeNext(hi, 0) != C_ERR) { + + char hmsetCmd[] = "*4\r\n$5\r\nHMSET\r\n"; + if ( (!rioWrite(r, hmsetCmd, sizeof(hmsetCmd) - 1)) || + (!rioWriteBulkObject(r, key)) || + (!rioWriteHashIteratorCursor(r, hi, OBJ_HASH_KEY)) || + (!rioWriteHashIteratorCursor(r, hi, OBJ_HASH_VALUE)) ) + goto reHashEnd; + + if (hi->expire_time != EB_EXPIRE_TIME_INVALID) { + char cmd[] = "*6\r\n$10\r\nHPEXPIREAT\r\n"; + if ( (!rioWrite(r, cmd, sizeof(cmd) - 1)) || + (!rioWriteBulkObject(r, key)) || + (!rioWriteBulkLongLong(r, hi->expire_time)) || + (!rioWriteBulkString(r, "FIELDS", 6)) || + (!rioWriteBulkString(r, "1", 1)) || + (!rioWriteHashIteratorCursor(r, hi, OBJ_HASH_KEY)) ) + goto reHashEnd; + } } - if (++count == AOF_REWRITE_ITEMS_PER_CMD) count = 0; - items--; } - hashTypeReleaseIterator(hi); + res = 1; /* success */ - return 1; +reHashEnd: + hashTypeReleaseIterator(hi); + return res; } /* Helper for rewriteStreamObject() that generates a bulk string into the @@ -2245,11 +2247,11 @@ static int rewriteFunctions(rio *aof) { } int rewriteAppendOnlyFileRio(rio *aof) { - dictIterator *di = NULL; dictEntry *de; int j; long key_count = 0; long long updated_time = 0; + kvstoreIterator *kvs_it = NULL; /* Record timestamp at the beginning of rewriting AOF. 
*/ if (server.aof_timestamp_enabled) { @@ -2262,17 +2264,16 @@ int rewriteAppendOnlyFileRio(rio *aof) { for (j = 0; j < server.dbnum; j++) { char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n"; - redisDb *db = server.db+j; - dict *d = db->dict; - if (dictSize(d) == 0) continue; - di = dictGetSafeIterator(d); + redisDb *db = server.db + j; + if (kvstoreSize(db->keys) == 0) continue; /* SELECT the new DB */ if (rioWrite(aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr; if (rioWriteBulkLongLong(aof,j) == 0) goto werr; + kvs_it = kvstoreIteratorInit(db->keys); /* Iterate this DB writing every entry */ - while((de = dictNext(di)) != NULL) { + while((de = kvstoreIteratorNext(kvs_it)) != NULL) { sds keystr; robj key, *o; long long expiretime; @@ -2337,13 +2338,12 @@ int rewriteAppendOnlyFileRio(rio *aof) { if (server.rdb_key_save_delay) debugDelay(server.rdb_key_save_delay); } - dictReleaseIterator(di); - di = NULL; + kvstoreIteratorRelease(kvs_it); } return C_OK; werr: - if (di) dictReleaseIterator(di); + if (kvs_it) kvstoreIteratorRelease(kvs_it); return C_ERR; } @@ -2454,7 +2454,23 @@ int rewriteAppendOnlyFileBackground(void) { server.aof_lastbgrewrite_status = C_ERR; return C_ERR; } + + if (server.aof_state == AOF_WAIT_REWRITE) { + /* Wait for all bio jobs related to AOF to drain. This prevents a race + * between updates to `fsynced_reploff_pending` of the worker thread, belonging + * to the previous AOF, and the new one. This concern is specific for a full + * sync scenario where we don't wanna risk the ACKed replication offset + * jumping backwards or forward when switching to a different master. 
*/ + bioDrainWorker(BIO_AOF_FSYNC); + + /* Set the initial repl_offset, which will be applied to fsynced_reploff + * when AOFRW finishes (after possibly being updated by a bio thread) */ + atomicSet(server.fsynced_reploff_pending, server.master_repl_offset); + server.fsynced_reploff = 0; + } + server.stat_aof_rewrites++; + if ((childpid = redisFork(CHILD_TYPE_AOF)) == 0) { char tmpfile[256]; diff --git a/src/asciilogo.h b/src/asciilogo.h index a62f68cf94c..d1778edf329 100644 --- a/src/asciilogo.h +++ b/src/asciilogo.h @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ const char *ascii_logo = diff --git a/src/atomicvar.h b/src/atomicvar.h index 2c2969c33b9..b8529ba271b 100644 --- a/src/atomicvar.h +++ b/src/atomicvar.h @@ -1,16 +1,41 @@ /* This file implements atomic counters using c11 _Atomic, __atomic or __sync * macros if available, otherwise we will throw an error when compile. * - * The exported interface is composed of three macros: + * The exported interface is composed of the following macros: * * atomicIncr(var,count) -- Increment the atomic counter * atomicGetIncr(var,oldvalue_var,count) -- Get and increment the atomic counter + * atomicIncrGet(var,newvalue_var,count) -- Increment and get the atomic counter new value * atomicDecr(var,count) -- Decrement the atomic counter * atomicGet(var,dstvar) -- Fetch the atomic counter value * atomicSet(var,value) -- Set the atomic counter value * atomicGetWithSync(var,value) -- 'atomicGet' with inter-thread synchronization * atomicSetWithSync(var,value) -- 'atomicSet' with inter-thread synchronization - * + * + * Atomic operations on flags. + * Flag type can be int, long, long long or their unsigned counterparts. + * The value of the flag can be 1 or 0. 
+ * + * atomicFlagGetSet(var,oldvalue_var) -- Get and set the atomic counter value + * + * NOTE1: __atomic* and _Atomic implementations can be actually elaborated to support any value by changing the + * hardcoded new value passed to __atomic_exchange* from 1 to @param count + * i.e. oldvalue_var = atomic_exchange_explicit(&var, count). + * However, in order to be compatible with the __sync functions family, we can use only 0 and 1. + * The only exchange alternative suggested by __sync is __sync_lock_test_and_set, + * but as described by the gnu manual for __sync_lock_test_and_set(): + * https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html + * "A target may support reduced functionality here by which the only valid value to store is the immediate constant 1. The exact value + * actually stored in *ptr is implementation defined." + * Hence, we can't rely on it for any value other than 1. + * We eventually chose to implement this method with __sync_val_compare_and_swap since it satisfies functionality needed for atomicFlagGetSet + * (if the flag was 0 -> set to 1, if it's already 1 -> do nothing, but the final result is that the flag is set), + * and also it has a full barrier (__sync_lock_test_and_set has acquire barrier). + * + * NOTE2: Unlike other atomic types, which aren't guaranteed to be lock free, c11 atomic_flag is. + * To check whether a type is lock free, atomic_is_lock_free() can be used. + * It can be considered to limit the flag type to atomic_flag to improve performance. + * * Never use return value from the macros, instead use the AtomicGetIncr() * if you need to get the current value and increment it atomically, like * in the following example: @@ -21,32 +46,11 @@ * * ---------------------------------------------------------------------------- * - * Copyright (c) 2015, Salvatore Sanfilippo + * Copyright (c) 2015-Present, Redis Ltd. * All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #include @@ -93,6 +97,8 @@ #define atomicGetIncr(var,oldvalue_var,count) do { \ oldvalue_var = atomic_fetch_add_explicit(&var,(count),memory_order_relaxed); \ } while(0) +#define atomicIncrGet(var, newvalue_var, count) \ + newvalue_var = atomicIncr(var,count) + count #define atomicDecr(var,count) atomic_fetch_sub_explicit(&var,(count),memory_order_relaxed) #define atomicGet(var,dstvar) do { \ dstvar = atomic_load_explicit(&var,memory_order_relaxed); \ @@ -103,6 +109,8 @@ } while(0) #define atomicSetWithSync(var,value) \ atomic_store_explicit(&var,value,memory_order_seq_cst) +#define atomicFlagGetSet(var,oldvalue_var) \ + oldvalue_var = atomic_exchange_explicit(&var,1,memory_order_relaxed) #define REDIS_ATOMIC_API "c11-builtin" #elif !defined(__ATOMIC_VAR_FORCE_SYNC_MACROS) && \ @@ -111,6 +119,8 @@ /* Implementation using __atomic macros. */ #define atomicIncr(var,count) __atomic_add_fetch(&var,(count),__ATOMIC_RELAXED) +#define atomicIncrGet(var, newvalue_var, count) \ + newvalue_var = __atomic_add_fetch(&var,(count),__ATOMIC_RELAXED) #define atomicGetIncr(var,oldvalue_var,count) do { \ oldvalue_var = __atomic_fetch_add(&var,(count),__ATOMIC_RELAXED); \ } while(0) @@ -124,12 +134,16 @@ } while(0) #define atomicSetWithSync(var,value) \ __atomic_store_n(&var,value,__ATOMIC_SEQ_CST) +#define atomicFlagGetSet(var,oldvalue_var) \ + oldvalue_var = __atomic_exchange_n(&var,1,__ATOMIC_RELAXED) #define REDIS_ATOMIC_API "atomic-builtin" #elif defined(HAVE_ATOMIC) /* Implementation using __sync macros. 
*/ #define atomicIncr(var,count) __sync_add_and_fetch(&var,(count)) +#define atomicIncrGet(var, newvalue_var, count) \ + newvalue_var = __sync_add_and_fetch(&var,(count)) #define atomicGetIncr(var,oldvalue_var,count) do { \ oldvalue_var = __sync_fetch_and_add(&var,(count)); \ } while(0) @@ -149,6 +163,8 @@ ANNOTATE_HAPPENS_BEFORE(&var); \ while(!__sync_bool_compare_and_swap(&var,var,value,__sync_synchronize)); \ } while(0) +#define atomicFlagGetSet(var,oldvalue_var) \ + oldvalue_var = __sync_val_compare_and_swap(&var,0,1) #define REDIS_ATOMIC_API "sync-builtin" #else diff --git a/src/bio.c b/src/bio.c index 10ecf8db294..6f96ef709ee 100644 --- a/src/bio.c +++ b/src/bio.c @@ -1,16 +1,16 @@ /* Background I/O service for Redis. * * This file implements operations that we need to perform in the background. - * Currently there is only a single operation, that is a background close(2) - * system call. This is needed as when the process is the last owner of a - * reference to a file closing it means unlinking it, and the deletion of the - * file is slow, blocking the server. + * Currently there are 3 operations: + * 1) a background close(2) system call. This is needed when the process is + * the last owner of a reference to a file closing it means unlinking it, and + * the deletion of the file is slow, blocking the server. + * 2) AOF fsync + * 3) lazyfree of memory * * In the future we'll either continue implementing new things we need or * we'll switch to libeio. However there are probably long term uses for this - * file as we may want to put here Redis specific background tasks (for instance - * it is not impossible that we'll need a non blocking FLUSHDB/FLUSHALL - * implementation). + * file as we may want to put here Redis specific background tasks. * * DESIGN * ------ @@ -26,42 +26,26 @@ * least-recently-inserted to the most-recently-inserted (older jobs processed * first). 
* - * Currently there is no way for the creator of the job to be notified about - * the completion of the operation, this will only be added when/if needed. + * To let the creator of the job to be notified about the completion of the + * operation, it will need to submit additional dummy job, coined as + * completion job request that will be written back eventually, by the + * background thread, into completion job response queue. This notification + * layout can simplify flows that might submit more than one job, such as + * in case of FLUSHALL which for a single command submits multiple jobs. It + * is also correct because jobs are processed in FIFO fashion. * * ---------------------------------------------------------------------------- * - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ - #include "server.h" #include "bio.h" +#include static char* bio_worker_title[] = { "bio_close_file", @@ -76,6 +60,9 @@ static unsigned int bio_job_to_worker[] = { [BIO_AOF_FSYNC] = 1, [BIO_CLOSE_AOF] = 1, [BIO_LAZY_FREE] = 2, + [BIO_COMP_RQ_CLOSE_FILE] = 0, + [BIO_COMP_RQ_AOF_FSYNC] = 1, + [BIO_COMP_RQ_LAZY_FREE] = 2 }; static pthread_t bio_threads[BIO_WORKER_NUM]; @@ -84,6 +71,18 @@ static pthread_cond_t bio_newjob_cond[BIO_WORKER_NUM]; static list *bio_jobs[BIO_WORKER_NUM]; static unsigned long bio_jobs_counter[BIO_NUM_OPS] = {0}; +/* The bio_comp_list is used to hold completion job responses and to handover + * to main thread to callback as notification for job completion. Main + * thread will be triggered to read the list by signaling via writing to a pipe */ +static list *bio_comp_list; +static pthread_mutex_t bio_mutex_comp; +static int job_comp_pipe[2]; /* Pipe used to awake the event loop */ + +typedef struct bio_comp_item { + comp_fn *func; /* callback after completion job will be processed */ + uint64_t arg; /* user data to be passed to the function */ +} bio_comp_item; + /* This structure represents a background Job. It is only used locally to this * file as the API does not expose the internals at all. 
*/ typedef union bio_job { @@ -107,9 +106,15 @@ typedef union bio_job { lazy_free_fn *free_fn; /* Function that will free the provided arguments */ void *free_args[]; /* List of arguments to be passed to the free function */ } free_args; + struct { + int type; /* header */ + comp_fn *fn; /* callback. Handover to main thread to cb as notify for job completion */ + uint64_t arg; /* callback arguments */ + } comp_rq; } bio_job; void *bioProcessBackgroundJobs(void *arg); +void bioPipeReadJobCompList(aeEventLoop *el, int fd, void *privdata, int mask); /* Make sure we have enough stack to perform all the things we do in the * main thread. */ @@ -129,6 +134,27 @@ void bioInit(void) { bio_jobs[j] = listCreate(); } + /* init jobs comp responses */ + bio_comp_list = listCreate(); + pthread_mutex_init(&bio_mutex_comp, NULL); + + /* Create a pipe for background thread to be able to wake up the redis main thread. + * Make the pipe non blocking. This is just a best effort aware mechanism + * and we do not want to block not in the read nor in the write half. + * Enable close-on-exec flag on pipes in case of the fork-exec system calls in + * sentinels or redis servers. */ + if (anetPipe(job_comp_pipe, O_CLOEXEC|O_NONBLOCK, O_CLOEXEC|O_NONBLOCK) == -1) { + serverLog(LL_WARNING, + "Can't create the pipe for bio thread: %s", strerror(errno)); + exit(1); + } + + /* Register a readable event for the pipe used to awake the event loop on job completion */ + if (aeCreateFileEvent(server.el, job_comp_pipe[0], AE_READABLE, + bioPipeReadJobCompList, NULL) == AE_ERR) { + serverPanic("Error registering the readable event for the bio pipe."); + } + /* Set the stack size as by default it may be small in some system */ pthread_attr_init(&attr); pthread_attr_getstacksize(&attr,&stacksize); @@ -174,6 +200,28 @@ void bioCreateLazyFreeJob(lazy_free_fn free_fn, int arg_count, ...) 
{ bioSubmitJob(BIO_LAZY_FREE, job); } +void bioCreateCompRq(bio_worker_t assigned_worker, comp_fn *func, uint64_t user_data) { + int type; + switch (assigned_worker) { + case BIO_WORKER_CLOSE_FILE: + type = BIO_COMP_RQ_CLOSE_FILE; + break; + case BIO_WORKER_AOF_FSYNC: + type = BIO_COMP_RQ_AOF_FSYNC; + break; + case BIO_WORKER_LAZY_FREE: + type = BIO_COMP_RQ_LAZY_FREE; + break; + default: + serverPanic("Invalid worker type in bioCreateCompRq()."); + } + + bio_job *job = zmalloc(sizeof(*job)); + job->comp_rq.fn = func; + job->comp_rq.arg = user_data; + bioSubmitJob(type, job); +} + void bioCreateCloseJob(int fd, int need_fsync, int need_reclaim_cache) { bio_job *job = zmalloc(sizeof(*job)); job->fd_args.fd = fd; @@ -285,6 +333,21 @@ void *bioProcessBackgroundJobs(void *arg) { close(job->fd_args.fd); } else if (job_type == BIO_LAZY_FREE) { job->free_args.free_fn(job->free_args.free_args); + } else if ((job_type == BIO_COMP_RQ_CLOSE_FILE) || + (job_type == BIO_COMP_RQ_AOF_FSYNC) || + (job_type == BIO_COMP_RQ_LAZY_FREE)) { + bio_comp_item *comp_rsp = zmalloc(sizeof(bio_comp_item)); + comp_rsp->func = job->comp_rq.fn; + comp_rsp->arg = job->comp_rq.arg; + + /* just write it to completion job responses */ + pthread_mutex_lock(&bio_mutex_comp); + listAddNodeTail(bio_comp_list, comp_rsp); + pthread_mutex_unlock(&bio_mutex_comp); + + if (write(job_comp_pipe[1],"A",1) != 1) { + /* Pipe is non-blocking, write() may fail if it's full. 
*/ + } } else { serverPanic("Wrong job type in bioProcessBackgroundJobs()."); } @@ -343,3 +406,34 @@ void bioKillThreads(void) { } } } + +void bioPipeReadJobCompList(aeEventLoop *el, int fd, void *privdata, int mask) { + UNUSED(el); + UNUSED(mask); + UNUSED(privdata); + + char buf[128]; + list *tmp_list = NULL; + + while (read(fd, buf, sizeof(buf)) == sizeof(buf)); + + /* Handle event loop events if pipe was written from event loop API */ + pthread_mutex_lock(&bio_mutex_comp); + if (listLength(bio_comp_list)) { + tmp_list = bio_comp_list; + bio_comp_list = listCreate(); + } + pthread_mutex_unlock(&bio_mutex_comp); + + if (!tmp_list) return; + + /* callback to all job completions */ + while (listLength(tmp_list)) { + listNode *ln = listFirst(tmp_list); + bio_comp_item *rsp = ln->value; + listDelNode(tmp_list, ln); + rsp->func(rsp->arg); + zfree(rsp); + } + listRelease(tmp_list); +} diff --git a/src/bio.h b/src/bio.h index 0d1fe9b4b9f..2679a2bf550 100644 --- a/src/bio.h +++ b/src/bio.h @@ -1,36 +1,35 @@ /* - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef __BIO_H #define __BIO_H typedef void lazy_free_fn(void *args[]); +typedef void comp_fn(uint64_t user_data); + +typedef enum bio_worker_t { + BIO_WORKER_CLOSE_FILE = 0, + BIO_WORKER_AOF_FSYNC, + BIO_WORKER_LAZY_FREE, + BIO_WORKER_NUM +} bio_worker_t; + +/* Background job opcodes */ +typedef enum bio_job_type_t { + BIO_CLOSE_FILE = 0, /* Deferred close(2) syscall. */ + BIO_AOF_FSYNC, /* Deferred AOF fsync. */ + BIO_LAZY_FREE, /* Deferred objects freeing. 
*/ + BIO_CLOSE_AOF, + BIO_COMP_RQ_CLOSE_FILE, /* Job completion request, registered on close-file worker's queue */ + BIO_COMP_RQ_AOF_FSYNC, /* Job completion request, registered on aof-fsync worker's queue */ + BIO_COMP_RQ_LAZY_FREE, /* Job completion request, registered on lazy-free worker's queue */ + BIO_NUM_OPS +} bio_job_type_t; /* Exported API */ void bioInit(void); @@ -41,14 +40,7 @@ void bioCreateCloseJob(int fd, int need_fsync, int need_reclaim_cache); void bioCreateCloseAofJob(int fd, long long offset, int need_reclaim_cache); void bioCreateFsyncJob(int fd, long long offset, int need_reclaim_cache); void bioCreateLazyFreeJob(lazy_free_fn free_fn, int arg_count, ...); +void bioCreateCompRq(bio_worker_t assigned_worker, comp_fn *func, uint64_t user_data); -/* Background job opcodes */ -enum { - BIO_CLOSE_FILE = 0, /* Deferred close(2) syscall. */ - BIO_AOF_FSYNC, /* Deferred AOF fsync. */ - BIO_LAZY_FREE, /* Deferred objects freeing. */ - BIO_CLOSE_AOF, /* Deferred close for AOF files. */ - BIO_NUM_OPS -}; #endif diff --git a/src/bitops.c b/src/bitops.c index 23d80554e44..c0388a15d6c 100644 --- a/src/bitops.c +++ b/src/bitops.c @@ -1,31 +1,10 @@ /* Bit operations. * - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -802,25 +781,12 @@ void bitcountCommand(client *c) { int isbit = 0; unsigned char first_byte_neg_mask = 0, last_byte_neg_mask = 0; - /* Lookup, check for type, and return 0 for non existing keys. */ - if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL || - checkType(c,o,OBJ_STRING)) return; - p = getObjectReadOnlyString(o,&strlen,llbuf); - /* Parse start/end range if any. 
*/ if (c->argc == 4 || c->argc == 5) { - long long totlen = strlen; - /* Make sure we will not overflow */ - serverAssert(totlen <= LLONG_MAX >> 3); if (getLongLongFromObjectOrReply(c,c->argv[2],&start,NULL) != C_OK) return; if (getLongLongFromObjectOrReply(c,c->argv[3],&end,NULL) != C_OK) return; - /* Convert negative indexes */ - if (start < 0 && end < 0 && start > end) { - addReply(c,shared.czero); - return; - } if (c->argc == 5) { if (!strcasecmp(c->argv[4]->ptr,"bit")) isbit = 1; else if (!strcasecmp(c->argv[4]->ptr,"byte")) isbit = 0; @@ -829,6 +795,20 @@ void bitcountCommand(client *c) { return; } } + /* Lookup, check for type. */ + o = lookupKeyRead(c->db, c->argv[1]); + if (checkType(c, o, OBJ_STRING)) return; + p = getObjectReadOnlyString(o,&strlen,llbuf); + long long totlen = strlen; + + /* Make sure we will not overflow */ + serverAssert(totlen <= LLONG_MAX >> 3); + + /* Convert negative indexes */ + if (start < 0 && end < 0 && start > end) { + addReply(c,shared.czero); + return; + } if (isbit) totlen <<= 3; if (start < 0) start = totlen+start; if (end < 0) end = totlen+end; @@ -844,6 +824,10 @@ void bitcountCommand(client *c) { end >>= 3; } } else if (c->argc == 2) { + /* Lookup, check for type. */ + o = lookupKeyRead(c->db, c->argv[1]); + if (checkType(c, o, OBJ_STRING)) return; + p = getObjectReadOnlyString(o,&strlen,llbuf); /* The whole string. */ start = 0; end = strlen-1; @@ -853,6 +837,12 @@ void bitcountCommand(client *c) { return; } + /* Return 0 for non existing keys. */ + if (o == NULL) { + addReply(c, shared.czero); + return; + } + /* Precondition: end >= 0 && end < strlen, so the only condition where * zero can be returned is: start > end. */ if (start > end) { @@ -892,21 +882,8 @@ void bitposCommand(client *c) { return; } - /* If the key does not exist, from our point of view it is an infinite - * array of 0 bits. If the user is looking for the first clear bit return 0, - * If the user is looking for the first set bit, return -1. 
*/ - if ((o = lookupKeyRead(c->db,c->argv[1])) == NULL) { - addReplyLongLong(c, bit ? -1 : 0); - return; - } - if (checkType(c,o,OBJ_STRING)) return; - p = getObjectReadOnlyString(o,&strlen,llbuf); - /* Parse start/end range if any. */ if (c->argc == 4 || c->argc == 5 || c->argc == 6) { - long long totlen = strlen; - /* Make sure we will not overflow */ - serverAssert(totlen <= LLONG_MAX >> 3); if (getLongLongFromObjectOrReply(c,c->argv[3],&start,NULL) != C_OK) return; if (c->argc == 6) { @@ -921,10 +898,22 @@ void bitposCommand(client *c) { if (getLongLongFromObjectOrReply(c,c->argv[4],&end,NULL) != C_OK) return; end_given = 1; - } else { + } + + /* Lookup, check for type. */ + o = lookupKeyRead(c->db, c->argv[1]); + if (checkType(c, o, OBJ_STRING)) return; + p = getObjectReadOnlyString(o, &strlen, llbuf); + + /* Make sure we will not overflow */ + long long totlen = strlen; + serverAssert(totlen <= LLONG_MAX >> 3); + + if (c->argc < 5) { if (isbit) end = (totlen<<3) + 7; else end = totlen-1; } + if (isbit) totlen <<= 3; /* Convert negative indexes */ if (start < 0) start = totlen+start; @@ -941,6 +930,11 @@ void bitposCommand(client *c) { end >>= 3; } } else if (c->argc == 3) { + /* Lookup, check for type. */ + o = lookupKeyRead(c->db, c->argv[1]); + if (checkType(c,o,OBJ_STRING)) return; + p = getObjectReadOnlyString(o,&strlen,llbuf); + /* The whole string. */ start = 0; end = strlen-1; @@ -950,6 +944,14 @@ void bitposCommand(client *c) { return; } + /* If the key does not exist, from our point of view it is an infinite + * array of 0 bits. If the user is looking for the first clear bit return 0, + * If the user is looking for the first set bit, return -1. */ + if (o == NULL) { + addReplyLongLong(c, bit ? -1 : 0); + return; + } + /* For empty ranges (start > end) we return -1 as an empty range does * not contain a 0 nor a 1. 
*/ if (start > end) { diff --git a/src/blocked.c b/src/blocked.c index 6ad4667dba5..009e2557b31 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -1,31 +1,10 @@ /* blocked.c - generic support for blocking operations like BLPOP & WAIT. * - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
* * --------------------------------------------------------------------------- * @@ -89,6 +68,7 @@ void blockClient(client *c, int btype) { /* Master client should never be blocked unless pause or module */ serverAssert(!(c->flags & CLIENT_MASTER && btype != BLOCKED_MODULE && + btype != BLOCKED_LAZYFREE && btype != BLOCKED_POSTPONE)); c->flags |= CLIENT_BLOCKED; @@ -196,6 +176,8 @@ void unblockClient(client *c, int queue_for_reprocessing) { c->postponed_list_node = NULL; } else if (c->bstate.btype == BLOCKED_SHUTDOWN) { /* No special cleanup. */ + } else if (c->bstate.btype == BLOCKED_LAZYFREE) { + /* No special cleanup. */ } else { serverPanic("Unknown btype in unblockClient()."); } @@ -227,7 +209,9 @@ void unblockClient(client *c, int queue_for_reprocessing) { * send it a reply of some kind. After this function is called, * unblockClient() will be called with the same client as argument. */ void replyToBlockedClientTimedOut(client *c) { - if (c->bstate.btype == BLOCKED_LIST || + if (c->bstate.btype == BLOCKED_LAZYFREE) { + addReply(c, shared.ok); /* No reason lazy-free to fail */ + } else if (c->bstate.btype == BLOCKED_LIST || c->bstate.btype == BLOCKED_ZSET || c->bstate.btype == BLOCKED_STREAM) { addReplyNullArray(c); @@ -239,7 +223,7 @@ void replyToBlockedClientTimedOut(client *c) { addReplyLongLong(c,server.fsynced_reploff >= c->bstate.reploffset); addReplyLongLong(c,replicationCountAOFAcksByOffset(c->bstate.reploffset)); } else if (c->bstate.btype == BLOCKED_MODULE) { - moduleBlockedClientTimedOut(c); + moduleBlockedClientTimedOut(c, 0); } else { serverPanic("Unknown btype in replyToBlockedClientTimedOut()."); } @@ -284,9 +268,16 @@ void disconnectAllBlockedClients(void) { if (c->bstate.btype == BLOCKED_POSTPONE) continue; - unblockClientOnError(c, - "-UNBLOCKED force unblock from blocking operation, " - "instance state changed (master -> replica?)"); + if (c->bstate.btype == BLOCKED_LAZYFREE) { + addReply(c, shared.ok); /* No reason lazy-free to fail */ + 
c->flags &= ~CLIENT_PENDING_COMMAND; + unblockClient(c, 1); + } else { + + unblockClientOnError(c, + "-UNBLOCKED force unblock from blocking operation, " + "instance state changed (master -> replica?)"); + } c->flags |= CLIENT_CLOSE_AFTER_REPLY; } } @@ -370,7 +361,12 @@ void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeo list *l; int j; - c->bstate.timeout = timeout; + if (!(c->flags & CLIENT_REPROCESSING_COMMAND)) { + /* If the client is re-processing the command, we do not set the timeout + * because we need to retain the client's original timeout. */ + c->bstate.timeout = timeout; + } + for (j = 0; j < numkeys; j++) { /* If the key already exists in the dictionary ignore it. */ if (!(client_blocked_entry = dictAddRaw(c->bstate.keys,keys[j],NULL))) { @@ -392,7 +388,6 @@ void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeo listAddNodeTail(l,c); dictSetVal(c->bstate.keys,client_blocked_entry,listLast(l)); - /* We need to add the key to blocking_keys_unblock_on_nokey, if the client * wants to be awakened if key is deleted (like XREADGROUP) */ if (unblock_on_nokey) { @@ -703,6 +698,9 @@ static void moduleUnblockClientOnKey(client *c, robj *key) { * we want to remove the pending flag to indicate we already responded to the * command with timeout reply. */ void unblockClientOnTimeout(client *c) { + /* The client has been unlocked (in the moduleUnblocked list), return ASAP. 
*/ + if (c->bstate.btype == BLOCKED_MODULE && isModuleClientUnblocked(c)) return; + replyToBlockedClientTimedOut(c); if (c->flags & CLIENT_PENDING_COMMAND) c->flags &= ~CLIENT_PENDING_COMMAND; @@ -720,21 +718,6 @@ void unblockClientOnError(client *c, const char *err_str) { unblockClient(c, 1); } -/* sets blocking_keys to the total number of keys which has at least one client blocked on them - * sets blocking_keys_on_nokey to the total number of keys which has at least one client - * blocked on them to be written or deleted */ -void totalNumberOfBlockingKeys(unsigned long *blocking_keys, unsigned long *bloking_keys_on_nokey) { - unsigned long bkeys=0, bkeys_on_nokey=0; - for (int j = 0; j < server.dbnum; j++) { - bkeys += dictSize(server.db[j].blocking_keys); - bkeys_on_nokey += dictSize(server.db[j].blocking_keys_unblock_on_nokey); - } - if (blocking_keys) - *blocking_keys = bkeys; - if (bloking_keys_on_nokey) - *bloking_keys_on_nokey = bkeys_on_nokey; -} - void blockedBeforeSleep(void) { /* Handle precise timeouts of blocked clients. */ handleBlockedClientsTimeout(); diff --git a/src/call_reply.c b/src/call_reply.c index ccd1b36d457..b246361afce 100644 --- a/src/call_reply.c +++ b/src/call_reply.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2021, Redis Labs Ltd. + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" diff --git a/src/call_reply.h b/src/call_reply.h index 657f24735ce..fc7013ea0ac 100644 --- a/src/call_reply.h +++ b/src/call_reply.h @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2021, Redis Labs Ltd. + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef SRC_CALL_REPLY_H_ diff --git a/src/childinfo.c b/src/childinfo.c index 1303dd04384..eb98d469a89 100644 --- a/src/childinfo.c +++ b/src/childinfo.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2016, Salvatore Sanfilippo + * Copyright (c) 2016-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" diff --git a/src/cli_common.c b/src/cli_common.c index 421e7d34a39..65372661e8f 100644 --- a/src/cli_common.c +++ b/src/cli_common.c @@ -1,35 +1,16 @@ /* CLI (command line interface) common methods * - * Copyright (c) 2020, Redis Labs + * Copyright (c) 2020-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "fmacros.h" #include "cli_common.h" +#include "version.h" + #include #include #include @@ -48,6 +29,9 @@ #define UNUSED(V) ((void) V) +char *redisGitSHA1(void); +char *redisGitDirty(void); + /* Wrapper around redisSecureConnection to avoid hiredis_ssl dependencies if * not building with TLS support. */ @@ -406,3 +390,34 @@ sds escapeJsonString(sds s, const char *p, size_t len) { } return sdscatlen(s,"\"",1); } + +sds cliVersion(void) { + sds version = sdscatprintf(sdsempty(), "%s", REDIS_VERSION); + + /* Add git commit and working tree status when available. 
*/ + if (strtoll(redisGitSHA1(),NULL,16)) { + version = sdscatprintf(version, " (git:%s", redisGitSHA1()); + if (strtoll(redisGitDirty(),NULL,10)) + version = sdscatprintf(version, "-dirty"); + version = sdscat(version, ")"); + } + return version; +} + +/* This is a wrapper to call redisConnect or redisConnectWithTimeout. */ +redisContext *redisConnectWrapper(const char *ip, int port, const struct timeval tv) { + if (tv.tv_sec == 0 && tv.tv_usec == 0) { + return redisConnect(ip, port); + } else { + return redisConnectWithTimeout(ip, port, tv); + } +} + +/* This is a wrapper to call redisConnectUnix or redisConnectUnixWithTimeout. */ +redisContext *redisConnectUnixWrapper(const char *path, const struct timeval tv) { + if (tv.tv_sec == 0 && tv.tv_usec == 0) { + return redisConnectUnix(path); + } else { + return redisConnectUnixWithTimeout(path, tv); + } +} diff --git a/src/cli_common.h b/src/cli_common.h index cffdee61d89..a5b8e44a28c 100644 --- a/src/cli_common.h +++ b/src/cli_common.h @@ -51,4 +51,9 @@ void freeCliConnInfo(cliConnInfo connInfo); sds escapeJsonString(sds s, const char *p, size_t len); +sds cliVersion(void); + +redisContext *redisConnectWrapper(const char *ip, int port, const struct timeval tv); +redisContext *redisConnectUnixWrapper(const char *path, const struct timeval tv); + #endif /* __CLICOMMON_H */ diff --git a/src/cluster.c b/src/cluster.c index a390585f3e6..d09a455b3a6 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1,6479 +1,91 @@ -/* Redis Cluster implementation. - * - * Copyright (c) 2009-2012, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "server.h" -#include "cluster.h" -#include "endianconv.h" -#include "connection.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* A global reference to myself is handy to make code more clear. - * Myself always points to server.cluster->myself, that is, the clusterNode - * that represents this node. 
*/ -clusterNode *myself = NULL; - -clusterNode *createClusterNode(char *nodename, int flags); -void clusterAddNode(clusterNode *node); -void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); -void clusterReadHandler(connection *conn); -void clusterSendPing(clusterLink *link, int type); -void clusterSendFail(char *nodename); -void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request); -void clusterUpdateState(void); -int clusterNodeGetSlotBit(clusterNode *n, int slot); -list *clusterGetNodesInMyShard(clusterNode *node); -int clusterNodeAddSlave(clusterNode *master, clusterNode *slave); -int clusterAddSlot(clusterNode *n, int slot); -int clusterDelSlot(int slot); -int clusterDelNodeSlots(clusterNode *node); -int clusterNodeSetSlotBit(clusterNode *n, int slot); -void clusterSetMaster(clusterNode *n); -void clusterHandleSlaveFailover(void); -void clusterHandleSlaveMigration(int max_slaves); -int bitmapTestBit(unsigned char *bitmap, int pos); -void bitmapSetBit(unsigned char *bitmap, int pos); -void bitmapClearBit(unsigned char *bitmap, int pos); -void clusterDoBeforeSleep(int flags); -void clusterSendUpdate(clusterLink *link, clusterNode *node); -void resetManualFailover(void); -void clusterCloseAllSlots(void); -void clusterSetNodeAsMaster(clusterNode *n); -void clusterDelNode(clusterNode *delnode); -sds representClusterNodeFlags(sds ci, uint16_t flags); -sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count); -void clusterFreeNodesSlotsInfo(clusterNode *n); -uint64_t clusterGetMaxEpoch(void); -int clusterBumpConfigEpochWithoutConsensus(void); -void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, const unsigned char *payload, uint32_t len); -const char *clusterGetMessageTypeString(int type); -void removeChannelsInSlot(unsigned int slot); -unsigned int countKeysInSlot(unsigned int hashslot); -unsigned int countChannelsInSlot(unsigned int hashslot); -unsigned int 
delKeysInSlot(unsigned int hashslot); -void clusterAddNodeToShard(const char *shard_id, clusterNode *node); -list *clusterLookupNodeListByShardId(const char *shard_id); -void clusterRemoveNodeFromShard(clusterNode *node); -int auxShardIdSetter(clusterNode *n, void *value, int length); -sds auxShardIdGetter(clusterNode *n, sds s); -int auxShardIdPresent(clusterNode *n); -int auxHumanNodenameSetter(clusterNode *n, void *value, int length); -sds auxHumanNodenameGetter(clusterNode *n, sds s); -int auxHumanNodenamePresent(clusterNode *n); -int auxTcpPortSetter(clusterNode *n, void *value, int length); -sds auxTcpPortGetter(clusterNode *n, sds s); -int auxTcpPortPresent(clusterNode *n); -int auxTlsPortSetter(clusterNode *n, void *value, int length); -sds auxTlsPortGetter(clusterNode *n, sds s); -int auxTlsPortPresent(clusterNode *n); -static void clusterBuildMessageHdr(clusterMsg *hdr, int type, size_t msglen); - -int getNodeDefaultClientPort(clusterNode *n) { - return server.tls_cluster ? n->tls_port : n->tcp_port; -} - -static inline int getNodeDefaultReplicationPort(clusterNode *n) { - return server.tls_replication ? n->tls_port : n->tcp_port; -} - -static inline int getNodeClientPort(clusterNode *n, int use_tls) { - return use_tls ? n->tls_port : n->tcp_port; -} - -static inline int defaultClientPort(void) { - return server.tls_cluster ? server.tls_port : server.port; -} - -/* Links to the next and previous entries for keys in the same slot are stored - * in the dict entry metadata. See Slot to Key API below. 
*/ -#define dictEntryNextInSlot(de) \ - (((clusterDictEntryMetadata *)dictEntryMetadata(de))->next) -#define dictEntryPrevInSlot(de) \ - (((clusterDictEntryMetadata *)dictEntryMetadata(de))->prev) - -#define isSlotUnclaimed(slot) \ - (server.cluster->slots[slot] == NULL || \ - bitmapTestBit(server.cluster->owner_not_claiming_slot, slot)) - -#define RCVBUF_INIT_LEN 1024 -#define RCVBUF_MAX_PREALLOC (1<<20) /* 1MB */ - -/* Cluster nodes hash table, mapping nodes addresses 1.2.3.4:6379 to - * clusterNode structures. */ -dictType clusterNodesDictType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictSdsKeyCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - NULL, /* val destructor */ - NULL /* allow to expand */ -}; - -/* Cluster re-addition blacklist. This maps node IDs to the time - * we can re-add this node. The goal is to avoid reading a removed - * node for some time. */ -dictType clusterNodesBlackListDictType = { - dictSdsCaseHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictSdsKeyCaseCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - NULL, /* val destructor */ - NULL /* allow to expand */ -}; - -static ConnectionType *connTypeOfCluster(void) { - if (server.tls_cluster) { - return connectionTypeTls(); - } - - return connectionTypeTcp(); -} -/* Cluster shards hash table, mapping shard id to list of nodes */ -dictType clusterSdsToListType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictSdsKeyCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - dictListDestructor, /* val destructor */ - NULL /* allow to expand */ -}; - -/* Aux fields are introduced in Redis 7.2 to support the persistence - * of various important node properties, such as shard id, in nodes.conf. - * Aux fields take an explicit format of name=value pairs and have no - * intrinsic order among them. 
Aux fields are always grouped together - * at the end of the second column of each row after the node's IP - * address/port/cluster_port and the optional hostname. Aux fields - * are separated by ','. */ - -/* Aux field setter function prototype - * return C_OK when the update is successful; C_ERR otherwise */ -typedef int (aux_value_setter) (clusterNode* n, void *value, int length); -/* Aux field getter function prototype - * return an sds that is a concatenation of the input sds string and - * the aux value */ -typedef sds (aux_value_getter) (clusterNode* n, sds s); - -typedef int (aux_value_present) (clusterNode* n); - -typedef struct { - char *field; - aux_value_setter *setter; - aux_value_getter *getter; - aux_value_present *isPresent; -} auxFieldHandler; - -/* Assign index to each aux field */ -typedef enum { - af_shard_id, - af_human_nodename, - af_tcp_port, - af_tls_port, - af_count, -} auxFieldIndex; - -/* Note that - * 1. the order of the elements below must match that of their - * indices as defined in auxFieldIndex - * 2. 
aux name can contain characters that pass the isValidAuxChar check only */ -auxFieldHandler auxFieldHandlers[] = { - {"shard-id", auxShardIdSetter, auxShardIdGetter, auxShardIdPresent}, - {"nodename", auxHumanNodenameSetter, auxHumanNodenameGetter, auxHumanNodenamePresent}, - {"tcp-port", auxTcpPortSetter, auxTcpPortGetter, auxTcpPortPresent}, - {"tls-port", auxTlsPortSetter, auxTlsPortGetter, auxTlsPortPresent}, -}; - -int isValidAuxChar(int c) { - return isalnum(c) || (strchr("!#$%&()*+:;<>?@[]^{|}~", c) == NULL); -} - -int isValidAuxString(char *s, unsigned int length) { - for (unsigned i = 0; i < length; i++) { - if (!isValidAuxChar(s[i])) return 0; - } - return 1; -} - -int auxShardIdSetter(clusterNode *n, void *value, int length) { - if (verifyClusterNodeId(value, length) == C_ERR) { - return C_ERR; - } - memcpy(n->shard_id, value, CLUSTER_NAMELEN); - /* if n already has replicas, make sure they all agree - * on the shard id */ - for (int i = 0; i < n->numslaves; i++) { - if (memcmp(n->slaves[i]->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) { - return C_ERR; - } - } - clusterAddNodeToShard(value, n); - return C_OK; -} - -sds auxShardIdGetter(clusterNode *n, sds s) { - return sdscatprintf(s, "%.40s", n->shard_id); -} - -int auxShardIdPresent(clusterNode *n) { - return strlen(n->shard_id); -} - -int auxHumanNodenameSetter(clusterNode *n, void *value, int length) { - if (n && !strncmp(value, n->human_nodename, length)) { - return C_OK; - } else if (!n && (length == 0)) { - return C_OK; - } - if (n) { - n->human_nodename = sdscpylen(n->human_nodename, value, length); - } else if (sdslen(n->human_nodename) != 0) { - sdsclear(n->human_nodename); - } else { - return C_ERR; - } - return C_OK; -} - -sds auxHumanNodenameGetter(clusterNode *n, sds s) { - return sdscatprintf(s, "%s", n->human_nodename); -} - -int auxHumanNodenamePresent(clusterNode *n) { - return sdslen(n->human_nodename); -} - -int auxTcpPortSetter(clusterNode *n, void *value, int length) { - if 
(length > 5 || length < 1) { - return C_ERR; - } - char buf[length + 1]; - memcpy(buf, (char*)value, length); - buf[length] = '\0'; - n->tcp_port = atoi(buf); - return (n->tcp_port < 0 || n->tcp_port >= 65536) ? C_ERR : C_OK; -} - -sds auxTcpPortGetter(clusterNode *n, sds s) { - return sdscatprintf(s, "%d", n->tcp_port); -} - -int auxTcpPortPresent(clusterNode *n) { - return n->tcp_port >= 0 && n->tcp_port < 65536; -} - -int auxTlsPortSetter(clusterNode *n, void *value, int length) { - if (length > 5 || length < 1) { - return C_ERR; - } - char buf[length + 1]; - memcpy(buf, (char*)value, length); - buf[length] = '\0'; - n->tls_port = atoi(buf); - return (n->tls_port < 0 || n->tls_port >= 65536) ? C_ERR : C_OK; -} - -sds auxTlsPortGetter(clusterNode *n, sds s) { - return sdscatprintf(s, "%d", n->tls_port); -} - -int auxTlsPortPresent(clusterNode *n) { - return n->tls_port >= 0 && n->tls_port < 65536; -} - -/* clusterLink send queue blocks */ -typedef struct { - size_t totlen; /* Total length of this block including the message */ - int refcount; /* Number of cluster link send msg queues containing the message */ - clusterMsg msg; -} clusterMsgSendBlock; - -/* ----------------------------------------------------------------------------- - * Initialization - * -------------------------------------------------------------------------- */ - -/* Load the cluster config from 'filename'. - * - * If the file does not exist or is zero-length (this may happen because - * when we lock the nodes.conf file, we create a zero-length one for the - * sake of locking if it does not already exist), C_ERR is returned. - * If the configuration was loaded from the file, C_OK is returned. 
*/ -int clusterLoadConfig(char *filename) { - FILE *fp = fopen(filename,"r"); - struct stat sb; - char *line; - int maxline, j; - - if (fp == NULL) { - if (errno == ENOENT) { - return C_ERR; - } else { - serverLog(LL_WARNING, - "Loading the cluster node config from %s: %s", - filename, strerror(errno)); - exit(1); - } - } - - if (redis_fstat(fileno(fp),&sb) == -1) { - serverLog(LL_WARNING, - "Unable to obtain the cluster node config file stat %s: %s", - filename, strerror(errno)); - exit(1); - } - /* Check if the file is zero-length: if so return C_ERR to signal - * we have to write the config. */ - if (sb.st_size == 0) { - fclose(fp); - return C_ERR; - } - - /* Parse the file. Note that single lines of the cluster config file can - * be really long as they include all the hash slots of the node. - * This means in the worst possible case, half of the Redis slots will be - * present in a single line, possibly in importing or migrating state, so - * together with the node ID of the sender/receiver. - * - * To simplify we allocate 1024+CLUSTER_SLOTS*128 bytes per line. */ - maxline = 1024+CLUSTER_SLOTS*128; - line = zmalloc(maxline); - while(fgets(line,maxline,fp) != NULL) { - int argc, aux_argc; - sds *argv, *aux_argv; - clusterNode *n, *master; - char *p, *s; - - /* Skip blank lines, they can be created either by users manually - * editing nodes.conf or by the config writing process if stopped - * before the truncate() call. */ - if (line[0] == '\n' || line[0] == '\0') continue; - - /* Split the line into arguments for processing. */ - argv = sdssplitargs(line,&argc); - if (argv == NULL) goto fmterr; - - /* Handle the special "vars" line. Don't pretend it is the last - * line even if it actually is when generated by Redis. 
*/ - if (strcasecmp(argv[0],"vars") == 0) { - if (!(argc % 2)) goto fmterr; - for (j = 1; j < argc; j += 2) { - if (strcasecmp(argv[j],"currentEpoch") == 0) { - server.cluster->currentEpoch = - strtoull(argv[j+1],NULL,10); - } else if (strcasecmp(argv[j],"lastVoteEpoch") == 0) { - server.cluster->lastVoteEpoch = - strtoull(argv[j+1],NULL,10); - } else { - serverLog(LL_NOTICE, - "Skipping unknown cluster config variable '%s'", - argv[j]); - } - } - sdsfreesplitres(argv,argc); - continue; - } - - /* Regular config lines have at least eight fields */ - if (argc < 8) { - sdsfreesplitres(argv,argc); - goto fmterr; - } - - /* Create this node if it does not exist */ - if (verifyClusterNodeId(argv[0], sdslen(argv[0])) == C_ERR) { - sdsfreesplitres(argv, argc); - goto fmterr; - } - n = clusterLookupNode(argv[0], sdslen(argv[0])); - if (!n) { - n = createClusterNode(argv[0],0); - clusterAddNode(n); - } - /* Format for the node address and auxiliary argument information: - * ip:port[@cport][,hostname][,aux=val]*] */ - - aux_argv = sdssplitlen(argv[1], sdslen(argv[1]), ",", 1, &aux_argc); - if (aux_argv == NULL) { - sdsfreesplitres(argv,argc); - goto fmterr; - } - - /* Hostname is an optional argument that defines the endpoint - * that can be reported to clients instead of IP. */ - if (aux_argc > 1 && sdslen(aux_argv[1]) > 0) { - n->hostname = sdscpy(n->hostname, aux_argv[1]); - } else if (sdslen(n->hostname) != 0) { - sdsclear(n->hostname); - } - - /* All fields after hostname are auxiliary and they take on - * the format of "aux=val" where both aux and val can contain - * characters that pass the isValidAuxChar check only. The order - * of the aux fields is insignificant. 
*/ - int aux_tcp_port = 0; - int aux_tls_port = 0; - for (int i = 2; i < aux_argc; i++) { - int field_argc; - sds *field_argv; - field_argv = sdssplitlen(aux_argv[i], sdslen(aux_argv[i]), "=", 1, &field_argc); - if (field_argv == NULL || field_argc != 2) { - /* Invalid aux field format */ - if (field_argv != NULL) sdsfreesplitres(field_argv, field_argc); - sdsfreesplitres(argv,argc); - goto fmterr; - } - - /* Validate that both aux and value contain valid characters only */ - for (unsigned j = 0; j < 2; j++) { - if (!isValidAuxString(field_argv[j],sdslen(field_argv[j]))){ - /* Invalid aux field format */ - sdsfreesplitres(field_argv, field_argc); - sdsfreesplitres(argv,argc); - goto fmterr; - } - } - - /* Note that we don't expect lots of aux fields in the foreseeable - * future so a linear search is completely fine. */ - int field_found = 0; - for (unsigned j = 0; j < numElements(auxFieldHandlers); j++) { - if (sdslen(field_argv[0]) != strlen(auxFieldHandlers[j].field) || - memcmp(field_argv[0], auxFieldHandlers[j].field, sdslen(field_argv[0])) != 0) { - continue; - } - field_found = 1; - aux_tcp_port |= j == af_tcp_port; - aux_tls_port |= j == af_tls_port; - if (auxFieldHandlers[j].setter(n, field_argv[1], sdslen(field_argv[1])) != C_OK) { - /* Invalid aux field format */ - sdsfreesplitres(field_argv, field_argc); - sdsfreesplitres(argv,argc); - goto fmterr; - } - } - - if (field_found == 0) { - /* Invalid aux field format */ - sdsfreesplitres(field_argv, field_argc); - sdsfreesplitres(argv,argc); - goto fmterr; - } - - sdsfreesplitres(field_argv, field_argc); - } - /* Address and port */ - if ((p = strrchr(aux_argv[0],':')) == NULL) { - sdsfreesplitres(aux_argv, aux_argc); - sdsfreesplitres(argv,argc); - goto fmterr; - } - *p = '\0'; - memcpy(n->ip,aux_argv[0],strlen(aux_argv[0])+1); - char *port = p+1; - char *busp = strchr(port,'@'); - if (busp) { - *busp = '\0'; - busp++; - } - /* If neither TCP or TLS port is found in aux field, it is considered - * an old 
version of nodes.conf file.*/ - if (!aux_tcp_port && !aux_tls_port) { - if (server.tls_cluster) { - n->tls_port = atoi(port); - } else { - n->tcp_port = atoi(port); - } - } else if (!aux_tcp_port) { - n->tcp_port = atoi(port); - } else if (!aux_tls_port) { - n->tls_port = atoi(port); - } - /* In older versions of nodes.conf the "@busport" part is missing. - * In this case we set it to the default offset of 10000 from the - * base port. */ - n->cport = busp ? atoi(busp) : (getNodeDefaultClientPort(n) + CLUSTER_PORT_INCR); - - /* The plaintext port for client in a TLS cluster (n->pport) is not - * stored in nodes.conf. It is received later over the bus protocol. */ - - sdsfreesplitres(aux_argv, aux_argc); - - /* Parse flags */ - p = s = argv[2]; - while(p) { - p = strchr(s,','); - if (p) *p = '\0'; - if (!strcasecmp(s,"myself")) { - serverAssert(server.cluster->myself == NULL); - myself = server.cluster->myself = n; - n->flags |= CLUSTER_NODE_MYSELF; - } else if (!strcasecmp(s,"master")) { - n->flags |= CLUSTER_NODE_MASTER; - } else if (!strcasecmp(s,"slave")) { - n->flags |= CLUSTER_NODE_SLAVE; - } else if (!strcasecmp(s,"fail?")) { - n->flags |= CLUSTER_NODE_PFAIL; - } else if (!strcasecmp(s,"fail")) { - n->flags |= CLUSTER_NODE_FAIL; - n->fail_time = mstime(); - } else if (!strcasecmp(s,"handshake")) { - n->flags |= CLUSTER_NODE_HANDSHAKE; - } else if (!strcasecmp(s,"noaddr")) { - n->flags |= CLUSTER_NODE_NOADDR; - } else if (!strcasecmp(s,"nofailover")) { - n->flags |= CLUSTER_NODE_NOFAILOVER; - } else if (!strcasecmp(s,"noflags")) { - /* nothing to do */ - } else { - serverPanic("Unknown flag in redis cluster config file"); - } - if (p) s = p+1; - } - - /* Get master if any. Set the master and populate master's - * slave list. 
*/ - if (argv[3][0] != '-') { - if (verifyClusterNodeId(argv[3], sdslen(argv[3])) == C_ERR) { - sdsfreesplitres(argv, argc); - goto fmterr; - } - master = clusterLookupNode(argv[3], sdslen(argv[3])); - if (!master) { - master = createClusterNode(argv[3],0); - clusterAddNode(master); - } - /* shard_id can be absent if we are loading a nodes.conf generated - * by an older version of Redis; we should follow the primary's - * shard_id in this case */ - if (auxFieldHandlers[af_shard_id].isPresent(n) == 0) { - memcpy(n->shard_id, master->shard_id, CLUSTER_NAMELEN); - clusterAddNodeToShard(master->shard_id, n); - } else if (clusterGetNodesInMyShard(master) != NULL && - memcmp(master->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) - { - /* If the primary has been added to a shard, make sure this - * node has the same persisted shard id as the primary. */ - goto fmterr; - } - n->slaveof = master; - clusterNodeAddSlave(master,n); - } else if (auxFieldHandlers[af_shard_id].isPresent(n) == 0) { - /* n is a primary but it does not have a persisted shard_id. - * This happens if we are loading a nodes.conf generated by - * an older version of Redis. We should manually update the - * shard membership in this case */ - clusterAddNodeToShard(n->shard_id, n); - } - - /* Set ping sent / pong received timestamps */ - if (atoi(argv[4])) n->ping_sent = mstime(); - if (atoi(argv[5])) n->pong_received = mstime(); - - /* Set configEpoch for this node. - * If the node is a replica, set its config epoch to 0. - * If it's a primary, load the config epoch from the configuration file. */ - n->configEpoch = (nodeIsSlave(n) && n->slaveof) ? 0 : strtoull(argv[6],NULL,10); - - /* Populate hash slots served by this instance. 
*/ - for (j = 8; j < argc; j++) { - int start, stop; - - if (argv[j][0] == '[') { - /* Here we handle migrating / importing slots */ - int slot; - char direction; - clusterNode *cn; - - p = strchr(argv[j],'-'); - serverAssert(p != NULL); - *p = '\0'; - direction = p[1]; /* Either '>' or '<' */ - slot = atoi(argv[j]+1); - if (slot < 0 || slot >= CLUSTER_SLOTS) { - sdsfreesplitres(argv,argc); - goto fmterr; - } - p += 3; - - char *pr = strchr(p, ']'); - size_t node_len = pr - p; - if (pr == NULL || verifyClusterNodeId(p, node_len) == C_ERR) { - sdsfreesplitres(argv, argc); - goto fmterr; - } - cn = clusterLookupNode(p, CLUSTER_NAMELEN); - if (!cn) { - cn = createClusterNode(p,0); - clusterAddNode(cn); - } - if (direction == '>') { - server.cluster->migrating_slots_to[slot] = cn; - } else { - server.cluster->importing_slots_from[slot] = cn; - } - continue; - } else if ((p = strchr(argv[j],'-')) != NULL) { - *p = '\0'; - start = atoi(argv[j]); - stop = atoi(p+1); - } else { - start = stop = atoi(argv[j]); - } - if (start < 0 || start >= CLUSTER_SLOTS || - stop < 0 || stop >= CLUSTER_SLOTS) - { - sdsfreesplitres(argv,argc); - goto fmterr; - } - while(start <= stop) clusterAddSlot(n, start++); - } - - sdsfreesplitres(argv,argc); - } - /* Config sanity check */ - if (server.cluster->myself == NULL) goto fmterr; - - zfree(line); - fclose(fp); - - serverLog(LL_NOTICE,"Node configuration loaded, I'm %.40s", myself->name); - - /* Something that should never happen: currentEpoch smaller than - * the max epoch found in the nodes configuration. However we handle this - * as some form of protection against manual editing of critical files. 
*/ - if (clusterGetMaxEpoch() > server.cluster->currentEpoch) { - server.cluster->currentEpoch = clusterGetMaxEpoch(); - } - return C_OK; - -fmterr: - serverLog(LL_WARNING, - "Unrecoverable error: corrupted cluster config file \"%s\".", line); - zfree(line); - if (fp) fclose(fp); - exit(1); -} - -/* Cluster node configuration is exactly the same as CLUSTER NODES output. - * - * This function writes the node config and returns 0, on error -1 - * is returned. - * - * Note: we need to write the file in an atomic way from the point of view - * of the POSIX filesystem semantics, so that if the server is stopped - * or crashes during the write, we'll end with either the old file or the - * new one. Since we have the full payload to write available we can use - * a single write to write the whole file. If the pre-existing file was - * bigger we pad our payload with newlines that are anyway ignored and truncate - * the file afterward. */ -int clusterSaveConfig(int do_fsync) { - sds ci,tmpfilename; - size_t content_size,offset = 0; - ssize_t written_bytes; - int fd = -1; - int retval = C_ERR; - - server.cluster->todo_before_sleep &= ~CLUSTER_TODO_SAVE_CONFIG; - - /* Get the nodes description and concatenate our "vars" directive to - * save currentEpoch and lastVoteEpoch. */ - ci = clusterGenNodesDescription(NULL, CLUSTER_NODE_HANDSHAKE, 0); - ci = sdscatprintf(ci,"vars currentEpoch %llu lastVoteEpoch %llu\n", - (unsigned long long) server.cluster->currentEpoch, - (unsigned long long) server.cluster->lastVoteEpoch); - content_size = sdslen(ci); - - /* Create a temp file with the new content. 
*/ - tmpfilename = sdscatfmt(sdsempty(),"%s.tmp-%i-%I", - server.cluster_configfile,(int) getpid(),mstime()); - if ((fd = open(tmpfilename,O_WRONLY|O_CREAT,0644)) == -1) { - serverLog(LL_WARNING,"Could not open temp cluster config file: %s",strerror(errno)); - goto cleanup; - } - - while (offset < content_size) { - written_bytes = write(fd,ci + offset,content_size - offset); - if (written_bytes <= 0) { - if (errno == EINTR) continue; - serverLog(LL_WARNING,"Failed after writing (%zd) bytes to tmp cluster config file: %s", - offset,strerror(errno)); - goto cleanup; - } - offset += written_bytes; - } - - if (do_fsync) { - server.cluster->todo_before_sleep &= ~CLUSTER_TODO_FSYNC_CONFIG; - if (redis_fsync(fd) == -1) { - serverLog(LL_WARNING,"Could not sync tmp cluster config file: %s",strerror(errno)); - goto cleanup; - } - } - - if (rename(tmpfilename, server.cluster_configfile) == -1) { - serverLog(LL_WARNING,"Could not rename tmp cluster config file: %s",strerror(errno)); - goto cleanup; - } - - if (do_fsync) { - if (fsyncFileDir(server.cluster_configfile) == -1) { - serverLog(LL_WARNING,"Could not sync cluster config file dir: %s",strerror(errno)); - goto cleanup; - } - } - retval = C_OK; /* If we reached this point, everything is fine. */ - -cleanup: - if (fd != -1) close(fd); - if (retval) unlink(tmpfilename); - sdsfree(tmpfilename); - sdsfree(ci); - return retval; -} - -void clusterSaveConfigOrDie(int do_fsync) { - if (clusterSaveConfig(do_fsync) == -1) { - serverLog(LL_WARNING,"Fatal: can't update cluster config file."); - exit(1); - } -} - -/* Lock the cluster config using flock(), and retain the file descriptor used to - * acquire the lock so that the file will be locked as long as the process is up. - * - * This works because we always update nodes.conf with a new version - * in-place, reopening the file, and writing to it in place (later adjusting - * the length with ftruncate()). 
- * - * On success C_OK is returned, otherwise an error is logged and - * the function returns C_ERR to signal a lock was not acquired. */ -int clusterLockConfig(char *filename) { -/* flock() does not exist on Solaris - * and a fcntl-based solution won't help, as we constantly re-open that file, - * which will release _all_ locks anyway - */ -#if !defined(__sun) - /* To lock it, we need to open the file in a way it is created if - * it does not exist, otherwise there is a race condition with other - * processes. */ - int fd = open(filename,O_WRONLY|O_CREAT|O_CLOEXEC,0644); - if (fd == -1) { - serverLog(LL_WARNING, - "Can't open %s in order to acquire a lock: %s", - filename, strerror(errno)); - return C_ERR; - } - - if (flock(fd,LOCK_EX|LOCK_NB) == -1) { - if (errno == EWOULDBLOCK) { - serverLog(LL_WARNING, - "Sorry, the cluster configuration file %s is already used " - "by a different Redis Cluster node. Please make sure that " - "different nodes use different cluster configuration " - "files.", filename); - } else { - serverLog(LL_WARNING, - "Impossible to lock %s: %s", filename, strerror(errno)); - } - close(fd); - return C_ERR; - } - /* Lock acquired: leak the 'fd' by not closing it until shutdown time, so that - * we'll retain the lock to the file as long as the process exists. - * - * After fork, the child process will get the fd opened by the parent process, - * we need save `fd` to `cluster_config_file_lock_fd`, so that in redisFork(), - * it will be closed in the child process. - * If it is not closed, when the main process is killed -9, but the child process - * (redis-aof-rewrite) is still alive, the fd(lock) will still be held by the - * child process, and the main process will fail to get lock, means fail to start. */ - server.cluster_config_file_lock_fd = fd; -#else - UNUSED(filename); -#endif /* __sun */ - - return C_OK; -} - -/* Derives our ports to be announced in the cluster bus. 
*/ -void deriveAnnouncedPorts(int *announced_tcp_port, int *announced_tls_port, - int *announced_cport) { - /* Config overriding announced ports. */ - *announced_tcp_port = server.cluster_announce_port ? - server.cluster_announce_port : server.port; - *announced_tls_port = server.cluster_announce_tls_port ? - server.cluster_announce_tls_port : server.tls_port; - /* Derive cluster bus port. */ - if (server.cluster_announce_bus_port) { - *announced_cport = server.cluster_announce_bus_port; - } else if (server.cluster_port) { - *announced_cport = server.cluster_port; - } else { - *announced_cport = defaultClientPort() + CLUSTER_PORT_INCR; - } -} - -/* Some flags (currently just the NOFAILOVER flag) may need to be updated - * in the "myself" node based on the current configuration of the node, - * that may change at runtime via CONFIG SET. This function changes the - * set of flags in myself->flags accordingly. */ -void clusterUpdateMyselfFlags(void) { - if (!myself) return; - int oldflags = myself->flags; - int nofailover = server.cluster_slave_no_failover ? - CLUSTER_NODE_NOFAILOVER : 0; - myself->flags &= ~CLUSTER_NODE_NOFAILOVER; - myself->flags |= nofailover; - if (myself->flags != oldflags) { - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } -} - - -/* We want to take myself->port/cport/pport in sync with the -* cluster-announce-port/cluster-announce-bus-port/cluster-announce-tls-port option. -* The option can be set at runtime via CONFIG SET. */ -void clusterUpdateMyselfAnnouncedPorts(void) { - if (!myself) return; - deriveAnnouncedPorts(&myself->tcp_port,&myself->tls_port,&myself->cport); -} - -/* We want to take myself->ip in sync with the cluster-announce-ip option. -* The option can be set at runtime via CONFIG SET. 
*/ -void clusterUpdateMyselfIp(void) { - if (!myself) return; - static char *prev_ip = NULL; - char *curr_ip = server.cluster_announce_ip; - int changed = 0; - - if (prev_ip == NULL && curr_ip != NULL) changed = 1; - else if (prev_ip != NULL && curr_ip == NULL) changed = 1; - else if (prev_ip && curr_ip && strcmp(prev_ip,curr_ip)) changed = 1; - - if (changed) { - if (prev_ip) zfree(prev_ip); - prev_ip = curr_ip; - - if (curr_ip) { - /* We always take a copy of the previous IP address, by - * duplicating the string. This way later we can check if - * the address really changed. */ - prev_ip = zstrdup(prev_ip); - redis_strlcpy(myself->ip,server.cluster_announce_ip,NET_IP_STR_LEN); - } else { - myself->ip[0] = '\0'; /* Force autodetection. */ - } - } -} - -/* Update the hostname for the specified node with the provided C string. */ -static void updateAnnouncedHostname(clusterNode *node, char *new) { - /* Previous and new hostname are the same, no need to update. */ - if (new && !strcmp(new, node->hostname)) { - return; - } else if (!new && (sdslen(node->hostname) == 0)) { - return; - } - - if (new) { - node->hostname = sdscpy(node->hostname, new); - } else if (sdslen(node->hostname) != 0) { - sdsclear(node->hostname); - } - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); -} - -static void updateAnnouncedHumanNodename(clusterNode *node, char *new) { - if (new && !strcmp(new, node->human_nodename)) { - return; - } else if (!new && (sdslen(node->human_nodename) == 0)) { - return; - } - - if (new) { - node->human_nodename = sdscpy(node->human_nodename, new); - } else if (sdslen(node->human_nodename) != 0) { - sdsclear(node->human_nodename); - } - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); -} - - -static void updateShardId(clusterNode *node, const char *shard_id) { - if (memcmp(node->shard_id, shard_id, CLUSTER_NAMELEN) != 0) { - clusterRemoveNodeFromShard(node); - memcpy(node->shard_id, shard_id, CLUSTER_NAMELEN); - clusterAddNodeToShard(shard_id, node); - 
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - } - if (myself != node && myself->slaveof == node) { - if (memcmp(myself->shard_id, shard_id, CLUSTER_NAMELEN) != 0) { - /* shard-id can diverge right after a rolling upgrade - * from pre-7.2 releases */ - clusterRemoveNodeFromShard(myself); - memcpy(myself->shard_id, shard_id, CLUSTER_NAMELEN); - clusterAddNodeToShard(shard_id, myself); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_FSYNC_CONFIG); - } - } -} - -/* Update my hostname based on server configuration values */ -void clusterUpdateMyselfHostname(void) { - if (!myself) return; - updateAnnouncedHostname(myself, server.cluster_announce_hostname); -} - -void clusterUpdateMyselfHumanNodename(void) { - if (!myself) return; - updateAnnouncedHumanNodename(myself, server.cluster_announce_human_nodename); -} - -void clusterInit(void) { - int saveconf = 0; - - server.cluster = zmalloc(sizeof(clusterState)); - server.cluster->myself = NULL; - server.cluster->currentEpoch = 0; - server.cluster->state = CLUSTER_FAIL; - server.cluster->size = 1; - server.cluster->todo_before_sleep = 0; - server.cluster->nodes = dictCreate(&clusterNodesDictType); - server.cluster->shards = dictCreate(&clusterSdsToListType); - server.cluster->nodes_black_list = - dictCreate(&clusterNodesBlackListDictType); - server.cluster->failover_auth_time = 0; - server.cluster->failover_auth_count = 0; - server.cluster->failover_auth_rank = 0; - server.cluster->failover_auth_epoch = 0; - server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; - server.cluster->lastVoteEpoch = 0; - - /* Initialize stats */ - for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { - server.cluster->stats_bus_messages_sent[i] = 0; - server.cluster->stats_bus_messages_received[i] = 0; - } - server.cluster->stats_pfail_nodes = 0; - server.cluster->stat_cluster_links_buffer_limit_exceeded = 0; - - memset(server.cluster->slots,0, sizeof(server.cluster->slots)); - clusterCloseAllSlots(); - - /* Lock the 
cluster config file to make sure every node uses - * its own nodes.conf. */ - server.cluster_config_file_lock_fd = -1; - if (clusterLockConfig(server.cluster_configfile) == C_ERR) - exit(1); - - /* Load or create a new nodes configuration. */ - if (clusterLoadConfig(server.cluster_configfile) == C_ERR) { - /* No configuration found. We will just use the random name provided - * by the createClusterNode() function. */ - myself = server.cluster->myself = - createClusterNode(NULL,CLUSTER_NODE_MYSELF|CLUSTER_NODE_MASTER); - serverLog(LL_NOTICE,"No cluster configuration found, I'm %.40s", - myself->name); - clusterAddNode(myself); - clusterAddNodeToShard(myself->shard_id, myself); - saveconf = 1; - } - if (saveconf) clusterSaveConfigOrDie(1); - - /* Port sanity check II - * The other handshake port check is triggered too late to stop - * us from trying to use a too-high cluster port number. */ - int port = defaultClientPort(); - if (!server.cluster_port && port > (65535-CLUSTER_PORT_INCR)) { - serverLog(LL_WARNING, "Redis port number too high. " - "Cluster communication port is 10,000 port " - "numbers higher than your Redis port. " - "Your Redis port number must be 55535 or less."); - exit(1); - } - if (!server.bindaddr_count) { - serverLog(LL_WARNING, "No bind address is configured, but it is required for the Cluster bus."); - exit(1); - } - - /* Initialize data for the Slot to key API. */ - slotToKeyInit(server.db); - - /* The slots -> channels map is a radix tree. Initialize it here. */ - server.cluster->slots_to_channels = raxNew(); - - /* Set myself->port/cport/pport to my listening ports, we'll just need to - * discover the IP address via MEET messages. 
*/ - deriveAnnouncedPorts(&myself->tcp_port, &myself->tls_port, &myself->cport); - - server.cluster->mf_end = 0; - server.cluster->mf_slave = NULL; - resetManualFailover(); - clusterUpdateMyselfFlags(); - clusterUpdateMyselfIp(); - clusterUpdateMyselfHostname(); - clusterUpdateMyselfHumanNodename(); -} - -void clusterInitListeners(void) { - if (connectionIndexByType(connTypeOfCluster()->get_type(NULL)) < 0) { - serverLog(LL_WARNING, "Missing connection type %s, but it is required for the Cluster bus.", connTypeOfCluster()->get_type(NULL)); - exit(1); - } - - int port = defaultClientPort(); - connListener *listener = &server.clistener; - listener->count = 0; - listener->bindaddr = server.bindaddr; - listener->bindaddr_count = server.bindaddr_count; - listener->port = server.cluster_port ? server.cluster_port : port + CLUSTER_PORT_INCR; - listener->ct = connTypeOfCluster(); - if (connListen(listener) == C_ERR ) { - /* Note: the following log text is matched by the test suite. */ - serverLog(LL_WARNING, "Failed listening on port %u (cluster), aborting.", listener->port); - exit(1); - } - - if (createSocketAcceptHandler(&server.clistener, clusterAcceptHandler) != C_OK) { - serverPanic("Unrecoverable error creating Redis Cluster socket accept handler."); - } -} - -/* Reset a node performing a soft or hard reset: - * - * 1) All other nodes are forgotten. - * 2) All the assigned / open slots are released. - * 3) If the node is a slave, it turns into a master. - * 4) Only for hard reset: a new Node ID is generated. - * 5) Only for hard reset: currentEpoch and configEpoch are set to 0. - * 6) The new configuration is saved and the cluster state updated. - * 7) If the node was a slave, the whole data set is flushed away. */ -void clusterReset(int hard) { - dictIterator *di; - dictEntry *de; - int j; - - /* Turn into master. 
*/ - if (nodeIsSlave(myself)) { - clusterSetNodeAsMaster(myself); - replicationUnsetMaster(); - emptyData(-1,EMPTYDB_NO_FLAGS,NULL); - } - - /* Close slots, reset manual failover state. */ - clusterCloseAllSlots(); - resetManualFailover(); - - /* Unassign all the slots. */ - for (j = 0; j < CLUSTER_SLOTS; j++) clusterDelSlot(j); - - /* Recreate shards dict */ - dictEmpty(server.cluster->shards, NULL); - - /* Forget all the nodes, but myself. */ - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (node == myself) continue; - clusterDelNode(node); - } - dictReleaseIterator(di); - - /* Hard reset only: set epochs to 0, change node ID. */ - if (hard) { - sds oldname; - - server.cluster->currentEpoch = 0; - server.cluster->lastVoteEpoch = 0; - myself->configEpoch = 0; - serverLog(LL_NOTICE, "configEpoch set to 0 via CLUSTER RESET HARD"); - - /* To change the Node ID we need to remove the old name from the - * nodes table, change the ID, and re-add back with new name. */ - oldname = sdsnewlen(myself->name, CLUSTER_NAMELEN); - dictDelete(server.cluster->nodes,oldname); - sdsfree(oldname); - getRandomHexChars(myself->name, CLUSTER_NAMELEN); - getRandomHexChars(myself->shard_id, CLUSTER_NAMELEN); - clusterAddNode(myself); - serverLog(LL_NOTICE,"Node hard reset, now I'm %.40s", myself->name); - } - - /* Re-populate shards */ - clusterAddNodeToShard(myself->shard_id, myself); - - /* Make sure to persist the new config and update the state. 
*/ - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); -} - -/* ----------------------------------------------------------------------------- - * CLUSTER communication link - * -------------------------------------------------------------------------- */ -static clusterMsgSendBlock *createClusterMsgSendBlock(int type, uint32_t msglen) { - uint32_t blocklen = msglen + sizeof(clusterMsgSendBlock) - sizeof(clusterMsg); - clusterMsgSendBlock *msgblock = zcalloc(blocklen); - msgblock->refcount = 1; - msgblock->totlen = blocklen; - server.stat_cluster_links_memory += blocklen; - clusterBuildMessageHdr(&msgblock->msg,type,msglen); - return msgblock; -} - -static void clusterMsgSendBlockDecrRefCount(void *node) { - clusterMsgSendBlock *msgblock = (clusterMsgSendBlock*)node; - msgblock->refcount--; - serverAssert(msgblock->refcount >= 0); - if (msgblock->refcount == 0) { - server.stat_cluster_links_memory -= msgblock->totlen; - zfree(msgblock); - } -} - -clusterLink *createClusterLink(clusterNode *node) { - clusterLink *link = zmalloc(sizeof(*link)); - link->ctime = mstime(); - link->send_msg_queue = listCreate(); - listSetFreeMethod(link->send_msg_queue, clusterMsgSendBlockDecrRefCount); - link->head_msg_send_offset = 0; - link->send_msg_queue_mem = sizeof(list); - link->rcvbuf = zmalloc(link->rcvbuf_alloc = RCVBUF_INIT_LEN); - link->rcvbuf_len = 0; - server.stat_cluster_links_memory += link->rcvbuf_alloc + link->send_msg_queue_mem; - link->conn = NULL; - link->node = node; - /* Related node can only possibly be known at link creation time if this is an outbound link */ - link->inbound = (node == NULL); - if (!link->inbound) { - node->link = link; - } - return link; -} - -/* Free a cluster link, but does not free the associated node of course. - * This function will just make sure that the original node associated - * with this link will have the 'link' field set to NULL. 
*/ -void freeClusterLink(clusterLink *link) { - if (link->conn) { - connClose(link->conn); - link->conn = NULL; - } - server.stat_cluster_links_memory -= sizeof(list) + listLength(link->send_msg_queue)*sizeof(listNode); - listRelease(link->send_msg_queue); - server.stat_cluster_links_memory -= link->rcvbuf_alloc; - zfree(link->rcvbuf); - if (link->node) { - if (link->node->link == link) { - serverAssert(!link->inbound); - link->node->link = NULL; - } else if (link->node->inbound_link == link) { - serverAssert(link->inbound); - link->node->inbound_link = NULL; - } - } - zfree(link); -} - -void setClusterNodeToInboundClusterLink(clusterNode *node, clusterLink *link) { - serverAssert(!link->node); - serverAssert(link->inbound); - if (node->inbound_link) { - /* A peer may disconnect and then reconnect with us, and it's not guaranteed that - * we would always process the disconnection of the existing inbound link before - * accepting a new existing inbound link. Therefore, it's possible to have more than - * one inbound link from the same node at the same time. Our cleanup logic assumes - * a one to one relationship between nodes and inbound links, so we need to kill - * one of the links. The existing link is more likely the outdated one, but it's - * possible the other node may need to open another link. */ - serverLog(LL_DEBUG, "Replacing inbound link fd %d from node %.40s with fd %d", - node->inbound_link->conn->fd, node->name, link->conn->fd); - freeClusterLink(node->inbound_link); - } - serverAssert(!node->inbound_link); - node->inbound_link = link; - link->node = node; -} - -static void clusterConnAcceptHandler(connection *conn) { - clusterLink *link; - - if (connGetState(conn) != CONN_STATE_CONNECTED) { - serverLog(LL_VERBOSE, - "Error accepting cluster node connection: %s", connGetLastError(conn)); - connClose(conn); - return; - } - - /* Create a link object we use to handle the connection. - * It gets passed to the readable handler when data is available. 
- * Initially the link->node pointer is set to NULL as we don't know - * which node is, but the right node is references once we know the - * node identity. */ - link = createClusterLink(NULL); - link->conn = conn; - connSetPrivateData(conn, link); - - /* Register read handler */ - connSetReadHandler(conn, clusterReadHandler); -} - -#define MAX_CLUSTER_ACCEPTS_PER_CALL 1000 -void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { - int cport, cfd; - int max = MAX_CLUSTER_ACCEPTS_PER_CALL; - char cip[NET_IP_STR_LEN]; - int require_auth = TLS_CLIENT_AUTH_YES; - UNUSED(el); - UNUSED(mask); - UNUSED(privdata); - - /* If the server is starting up, don't accept cluster connections: - * UPDATE messages may interact with the database content. */ - if (server.masterhost == NULL && server.loading) return; - - while(max--) { - cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport); - if (cfd == ANET_ERR) { - if (errno != EWOULDBLOCK) - serverLog(LL_VERBOSE, - "Error accepting cluster node: %s", server.neterr); - return; - } - - connection *conn = connCreateAccepted(connTypeOfCluster(), cfd, &require_auth); - - /* Make sure connection is not in an error state */ - if (connGetState(conn) != CONN_STATE_ACCEPTING) { - serverLog(LL_VERBOSE, - "Error creating an accepting connection for cluster node: %s", - connGetLastError(conn)); - connClose(conn); - return; - } - connEnableTcpNoDelay(conn); - connKeepAlive(conn,server.cluster_node_timeout / 1000 * 2); - - /* Use non-blocking I/O for cluster messages. */ - serverLog(LL_VERBOSE,"Accepting cluster node connection from %s:%d", cip, cport); - - /* Accept the connection now. connAccept() may call our handler directly - * or schedule it for later depending on connection implementation. 
- */ - if (connAccept(conn, clusterConnAcceptHandler) == C_ERR) { - if (connGetState(conn) == CONN_STATE_ERROR) - serverLog(LL_VERBOSE, - "Error accepting cluster node connection: %s", - connGetLastError(conn)); - connClose(conn); - return; - } - } -} - -/* Return the approximated number of sockets we are using in order to - * take the cluster bus connections. */ -unsigned long getClusterConnectionsCount(void) { - /* We decrement the number of nodes by one, since there is the - * "myself" node too in the list. Each node uses two file descriptors, - * one incoming and one outgoing, thus the multiplication by 2. */ - return server.cluster_enabled ? - ((dictSize(server.cluster->nodes)-1)*2) : 0; -} - -/* ----------------------------------------------------------------------------- - * Key space handling - * -------------------------------------------------------------------------- */ - -/* We have 16384 hash slots. The hash slot of a given key is obtained - * as the least significant 14 bits of the crc16 of the key. - * - * However if the key contains the {...} pattern, only the part between - * { and } is hashed. This may be useful in the future to force certain - * keys to be in the same node (assuming no resharding is in progress). */ -unsigned int keyHashSlot(char *key, int keylen) { - int s, e; /* start-end indexes of { and } */ - - for (s = 0; s < keylen; s++) - if (key[s] == '{') break; - - /* No '{' ? Hash the whole key. This is the base case. */ - if (s == keylen) return crc16(key,keylen) & 0x3FFF; - - /* '{' found? Check if we have the corresponding '}'. */ - for (e = s+1; e < keylen; e++) - if (key[e] == '}') break; - - /* No '}' or nothing between {} ? Hash the whole key. */ - if (e == keylen || e == s+1) return crc16(key,keylen) & 0x3FFF; - - /* If we are here there is both a { and a } on its right. Hash - * what is in the middle between { and }. 
*/ - return crc16(key+s+1,e-s-1) & 0x3FFF; -} - -/* ----------------------------------------------------------------------------- - * CLUSTER node API - * -------------------------------------------------------------------------- */ - -/* Create a new cluster node, with the specified flags. - * If "nodename" is NULL this is considered a first handshake and a random - * node name is assigned to this node (it will be fixed later when we'll - * receive the first pong). - * - * The node is created and returned to the user, but it is not automatically - * added to the nodes hash table. */ -clusterNode *createClusterNode(char *nodename, int flags) { - clusterNode *node = zmalloc(sizeof(*node)); - - if (nodename) - memcpy(node->name, nodename, CLUSTER_NAMELEN); - else - getRandomHexChars(node->name, CLUSTER_NAMELEN); - getRandomHexChars(node->shard_id, CLUSTER_NAMELEN); - node->ctime = mstime(); - node->configEpoch = 0; - node->flags = flags; - memset(node->slots,0,sizeof(node->slots)); - node->slot_info_pairs = NULL; - node->slot_info_pairs_count = 0; - node->numslots = 0; - node->numslaves = 0; - node->slaves = NULL; - node->slaveof = NULL; - node->last_in_ping_gossip = 0; - node->ping_sent = node->pong_received = 0; - node->data_received = 0; - node->fail_time = 0; - node->link = NULL; - node->inbound_link = NULL; - memset(node->ip,0,sizeof(node->ip)); - node->hostname = sdsempty(); - node->human_nodename = sdsempty(); - node->tcp_port = 0; - node->cport = 0; - node->tls_port = 0; - node->fail_reports = listCreate(); - node->voted_time = 0; - node->orphaned_time = 0; - node->repl_offset_time = 0; - node->repl_offset = 0; - listSetFreeMethod(node->fail_reports,zfree); - return node; -} - -/* This function is called every time we get a failure report from a node. - * The side effect is to populate the fail_reports list (or to update - * the timestamp of an existing report). - * - * 'failing' is the node that is in failure state according to the - * 'sender' node. 
- * - * The function returns 0 if it just updates a timestamp of an existing - * failure report from the same sender. 1 is returned if a new failure - * report is created. */ -int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) { - list *l = failing->fail_reports; - listNode *ln; - listIter li; - clusterNodeFailReport *fr; - - /* If a failure report from the same sender already exists, just update - * the timestamp. */ - listRewind(l,&li); - while ((ln = listNext(&li)) != NULL) { - fr = ln->value; - if (fr->node == sender) { - fr->time = mstime(); - return 0; - } - } - - /* Otherwise create a new report. */ - fr = zmalloc(sizeof(*fr)); - fr->node = sender; - fr->time = mstime(); - listAddNodeTail(l,fr); - return 1; -} - -/* Remove failure reports that are too old, where too old means reasonably - * older than the global node timeout. Note that anyway for a node to be - * flagged as FAIL we need to have a local PFAIL state that is at least - * older than the global node timeout, so we don't just trust the number - * of failure reports from other nodes. */ -void clusterNodeCleanupFailureReports(clusterNode *node) { - list *l = node->fail_reports; - listNode *ln; - listIter li; - clusterNodeFailReport *fr; - mstime_t maxtime = server.cluster_node_timeout * - CLUSTER_FAIL_REPORT_VALIDITY_MULT; - mstime_t now = mstime(); - - listRewind(l,&li); - while ((ln = listNext(&li)) != NULL) { - fr = ln->value; - if (now - fr->time > maxtime) listDelNode(l,ln); - } -} - -/* Remove the failing report for 'node' if it was previously considered - * failing by 'sender'. This function is called when a node informs us via - * gossip that a node is OK from its point of view (no FAIL or PFAIL flags). - * - * Note that this function is called relatively often as it gets called even - * when there are no nodes failing, and is O(N), however when the cluster is - * fine the failure reports list is empty so the function runs in constant - * time. 
- * - * The function returns 1 if the failure report was found and removed. - * Otherwise 0 is returned. */ -int clusterNodeDelFailureReport(clusterNode *node, clusterNode *sender) { - list *l = node->fail_reports; - listNode *ln; - listIter li; - clusterNodeFailReport *fr; - - /* Search for a failure report from this sender. */ - listRewind(l,&li); - while ((ln = listNext(&li)) != NULL) { - fr = ln->value; - if (fr->node == sender) break; - } - if (!ln) return 0; /* No failure report from this sender. */ - - /* Remove the failure report. */ - listDelNode(l,ln); - clusterNodeCleanupFailureReports(node); - return 1; -} - -/* Return the number of external nodes that believe 'node' is failing, - * not including this node, that may have a PFAIL or FAIL state for this - * node as well. */ -int clusterNodeFailureReportsCount(clusterNode *node) { - clusterNodeCleanupFailureReports(node); - return listLength(node->fail_reports); -} - -int clusterNodeRemoveSlave(clusterNode *master, clusterNode *slave) { - int j; - - for (j = 0; j < master->numslaves; j++) { - if (master->slaves[j] == slave) { - if ((j+1) < master->numslaves) { - int remaining_slaves = (master->numslaves - j) - 1; - memmove(master->slaves+j,master->slaves+(j+1), - (sizeof(*master->slaves) * remaining_slaves)); - } - master->numslaves--; - if (master->numslaves == 0) - master->flags &= ~CLUSTER_NODE_MIGRATE_TO; - return C_OK; - } - } - return C_ERR; -} - -int clusterNodeAddSlave(clusterNode *master, clusterNode *slave) { - int j; - - /* If it's already a slave, don't add it again. 
*/ - for (j = 0; j < master->numslaves; j++) - if (master->slaves[j] == slave) return C_ERR; - master->slaves = zrealloc(master->slaves, - sizeof(clusterNode*)*(master->numslaves+1)); - master->slaves[master->numslaves] = slave; - master->numslaves++; - master->flags |= CLUSTER_NODE_MIGRATE_TO; - return C_OK; -} - -int clusterCountNonFailingSlaves(clusterNode *n) { - int j, okslaves = 0; - - for (j = 0; j < n->numslaves; j++) - if (!nodeFailed(n->slaves[j])) okslaves++; - return okslaves; -} - -/* Low level cleanup of the node structure. Only called by clusterDelNode(). */ -void freeClusterNode(clusterNode *n) { - sds nodename; - int j; - - /* If the node has associated slaves, we have to set - * all the slaves->slaveof fields to NULL (unknown). */ - for (j = 0; j < n->numslaves; j++) - n->slaves[j]->slaveof = NULL; - - /* Remove this node from the list of slaves of its master. */ - if (nodeIsSlave(n) && n->slaveof) clusterNodeRemoveSlave(n->slaveof,n); - - /* Unlink from the set of nodes. */ - nodename = sdsnewlen(n->name, CLUSTER_NAMELEN); - serverAssert(dictDelete(server.cluster->nodes,nodename) == DICT_OK); - sdsfree(nodename); - sdsfree(n->hostname); - sdsfree(n->human_nodename); - - /* Release links and associated data structures. */ - if (n->link) freeClusterLink(n->link); - if (n->inbound_link) freeClusterLink(n->inbound_link); - listRelease(n->fail_reports); - zfree(n->slaves); - zfree(n); -} - -/* Add a node to the nodes hash table */ -void clusterAddNode(clusterNode *node) { - int retval; - - retval = dictAdd(server.cluster->nodes, - sdsnewlen(node->name,CLUSTER_NAMELEN), node); - serverAssert(retval == DICT_OK); -} - -/* Remove a node from the cluster. The function performs the high level - * cleanup, calling freeClusterNode() for the low level cleanup. - * Here we do the following: - * - * 1) Mark all the slots handled by it as unassigned. - * 2) Remove all the failure reports sent by this node and referenced by - * other nodes. 
- * 3) Remove the node from the owning shard - * 4) Free the node with freeClusterNode() that will in turn remove it - * from the hash table and from the list of slaves of its master, if - * it is a slave node. - */ -void clusterDelNode(clusterNode *delnode) { - int j; - dictIterator *di; - dictEntry *de; - - /* 1) Mark slots as unassigned. */ - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (server.cluster->importing_slots_from[j] == delnode) - server.cluster->importing_slots_from[j] = NULL; - if (server.cluster->migrating_slots_to[j] == delnode) - server.cluster->migrating_slots_to[j] = NULL; - if (server.cluster->slots[j] == delnode) - clusterDelSlot(j); - } - - /* 2) Remove failure reports. */ - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (node == delnode) continue; - clusterNodeDelFailureReport(node,delnode); - } - dictReleaseIterator(di); - - /* 3) Remove the node from the owning shard */ - clusterRemoveNodeFromShard(delnode); - - /* 4) Free the node, unlinking it from the cluster. */ - freeClusterNode(delnode); -} - -/* Cluster node sanity check. Returns C_OK if the node id - * is valid an C_ERR otherwise. */ -int verifyClusterNodeId(const char *name, int length) { - if (length != CLUSTER_NAMELEN) return C_ERR; - for (int i = 0; i < length; i++) { - if (name[i] >= 'a' && name[i] <= 'z') continue; - if (name[i] >= '0' && name[i] <= '9') continue; - return C_ERR; - } - return C_OK; -} - -/* Node lookup by name */ -clusterNode *clusterLookupNode(const char *name, int length) { - if (verifyClusterNodeId(name, length) != C_OK) return NULL; - sds s = sdsnewlen(name, length); - dictEntry *de = dictFind(server.cluster->nodes, s); - sdsfree(s); - if (de == NULL) return NULL; - return dictGetVal(de); -} - -/* Get all the nodes in my shard. 
- * Note that the list returned is not computed on the fly - * via slaveof; rather, it is maintained permanently to - * track the shard membership and its life cycle is tied - * to this Redis process. Therefore, the caller must not - * release the list. */ -list *clusterGetNodesInMyShard(clusterNode *node) { - sds s = sdsnewlen(node->shard_id, CLUSTER_NAMELEN); - dictEntry *de = dictFind(server.cluster->shards,s); - sdsfree(s); - return (de != NULL) ? dictGetVal(de) : NULL; -} - -/* This is only used after the handshake. When we connect a given IP/PORT - * as a result of CLUSTER MEET we don't have the node name yet, so we - * pick a random one, and will fix it when we receive the PONG request using - * this function. */ -void clusterRenameNode(clusterNode *node, char *newname) { - int retval; - sds s = sdsnewlen(node->name, CLUSTER_NAMELEN); - - serverLog(LL_DEBUG,"Renaming node %.40s into %.40s", - node->name, newname); - retval = dictDelete(server.cluster->nodes, s); - sdsfree(s); - serverAssert(retval == DICT_OK); - memcpy(node->name, newname, CLUSTER_NAMELEN); - clusterAddNode(node); -} - -void clusterAddNodeToShard(const char *shard_id, clusterNode *node) { - sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN); - dictEntry *de = dictFind(server.cluster->shards,s); - if (de == NULL) { - list *l = listCreate(); - listAddNodeTail(l, node); - serverAssert(dictAdd(server.cluster->shards, s, l) == DICT_OK); - } else { - list *l = dictGetVal(de); - if (listSearchKey(l, node) == NULL) { - listAddNodeTail(l, node); - } - sdsfree(s); - } -} - -void clusterRemoveNodeFromShard(clusterNode *node) { - sds s = sdsnewlen(node->shard_id, CLUSTER_NAMELEN); - dictEntry *de = dictFind(server.cluster->shards, s); - if (de != NULL) { - list *l = dictGetVal(de); - listNode *ln = listSearchKey(l, node); - if (ln != NULL) { - listDelNode(l, ln); - } - if (listLength(l) == 0) { - dictDelete(server.cluster->shards, s); - } - } - sdsfree(s); -} - -/* 
----------------------------------------------------------------------------- - * CLUSTER config epoch handling - * -------------------------------------------------------------------------- */ - -/* Return the greatest configEpoch found in the cluster, or the current - * epoch if greater than any node configEpoch. */ -uint64_t clusterGetMaxEpoch(void) { - uint64_t max = 0; - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - if (node->configEpoch > max) max = node->configEpoch; - } - dictReleaseIterator(di); - if (max < server.cluster->currentEpoch) max = server.cluster->currentEpoch; - return max; -} - -/* If this node epoch is zero or is not already the greatest across the - * cluster (from the POV of the local configuration), this function will: - * - * 1) Generate a new config epoch, incrementing the current epoch. - * 2) Assign the new epoch to this node, WITHOUT any consensus. - * 3) Persist the configuration on disk before sending packets with the - * new configuration. - * - * If the new config epoch is generated and assigned, C_OK is returned, - * otherwise C_ERR is returned (since the node has already the greatest - * configuration around) and no operation is performed. - * - * Important note: this function violates the principle that config epochs - * should be generated with consensus and should be unique across the cluster. - * However Redis Cluster uses this auto-generated new config epochs in two - * cases: - * - * 1) When slots are closed after importing. Otherwise resharding would be - * too expensive. - * 2) When CLUSTER FAILOVER is called with options that force a slave to - * failover its master even if there is not master majority able to - * create a new configuration epoch. 
- * - * Redis Cluster will not explode using this function, even in the case of - * a collision between this node and another node, generating the same - * configuration epoch unilaterally, because the config epoch conflict - * resolution algorithm will eventually move colliding nodes to different - * config epochs. However using this function may violate the "last failover - * wins" rule, so should only be used with care. */ -int clusterBumpConfigEpochWithoutConsensus(void) { - uint64_t maxEpoch = clusterGetMaxEpoch(); - - if (myself->configEpoch == 0 || - myself->configEpoch != maxEpoch) - { - server.cluster->currentEpoch++; - myself->configEpoch = server.cluster->currentEpoch; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_FSYNC_CONFIG); - serverLog(LL_NOTICE, - "New configEpoch set to %llu", - (unsigned long long) myself->configEpoch); - return C_OK; - } else { - return C_ERR; - } -} - -/* This function is called when this node is a master, and we receive from - * another master a configuration epoch that is equal to our configuration - * epoch. - * - * BACKGROUND - * - * It is not possible that different slaves get the same config - * epoch during a failover election, because the slaves need to get voted - * by a majority. However when we perform a manual resharding of the cluster - * the node will assign a configuration epoch to itself without to ask - * for agreement. Usually resharding happens when the cluster is working well - * and is supervised by the sysadmin, however it is possible for a failover - * to happen exactly while the node we are resharding a slot to assigns itself - * a new configuration epoch, but before it is able to propagate it. - * - * So technically it is possible in this condition that two nodes end with - * the same configuration epoch. - * - * Another possibility is that there are bugs in the implementation causing - * this to happen. 
- * - * Moreover when a new cluster is created, all the nodes start with the same - * configEpoch. This collision resolution code allows nodes to automatically - * end with a different configEpoch at startup automatically. - * - * In all the cases, we want a mechanism that resolves this issue automatically - * as a safeguard. The same configuration epoch for masters serving different - * set of slots is not harmful, but it is if the nodes end serving the same - * slots for some reason (manual errors or software bugs) without a proper - * failover procedure. - * - * In general we want a system that eventually always ends with different - * masters having different configuration epochs whatever happened, since - * nothing is worse than a split-brain condition in a distributed system. - * - * BEHAVIOR - * - * When this function gets called, what happens is that if this node - * has the lexicographically smaller Node ID compared to the other node - * with the conflicting epoch (the 'sender' node), it will assign itself - * the greatest configuration epoch currently detected among nodes plus 1. - * - * This means that even if there are multiple nodes colliding, the node - * with the greatest Node ID never moves forward, so eventually all the nodes - * end with a different configuration epoch. - */ -void clusterHandleConfigEpochCollision(clusterNode *sender) { - /* Prerequisites: nodes have the same configEpoch and are both masters. */ - if (sender->configEpoch != myself->configEpoch || - !nodeIsMaster(sender) || !nodeIsMaster(myself)) return; - /* Don't act if the colliding node has a smaller Node ID. */ - if (memcmp(sender->name,myself->name,CLUSTER_NAMELEN) <= 0) return; - /* Get the next ID available at the best of this node knowledge. */ - server.cluster->currentEpoch++; - myself->configEpoch = server.cluster->currentEpoch; - clusterSaveConfigOrDie(1); - serverLog(LL_VERBOSE, - "WARNING: configEpoch collision with node %.40s (%s)." 
- " configEpoch set to %llu", - sender->name,sender->human_nodename, - (unsigned long long) myself->configEpoch); -} - -/* ----------------------------------------------------------------------------- - * CLUSTER nodes blacklist - * - * The nodes blacklist is just a way to ensure that a given node with a given - * Node ID is not re-added before some time elapsed (this time is specified - * in seconds in CLUSTER_BLACKLIST_TTL). - * - * This is useful when we want to remove a node from the cluster completely: - * when CLUSTER FORGET is called, it also puts the node into the blacklist so - * that even if we receive gossip messages from other nodes that still remember - * about the node we want to remove, we don't re-add it before some time. - * - * Currently the CLUSTER_BLACKLIST_TTL is set to 1 minute, this means - * that redis-cli has 60 seconds to send CLUSTER FORGET messages to nodes - * in the cluster without dealing with the problem of other nodes re-adding - * back the node to nodes we already sent the FORGET command to. - * - * The data structure used is a hash table with an sds string representing - * the node ID as key, and the time when it is ok to re-add the node as - * value. - * -------------------------------------------------------------------------- */ - -#define CLUSTER_BLACKLIST_TTL 60 /* 1 minute. */ - - -/* Before of the addNode() or Exists() operations we always remove expired - * entries from the black list. This is an O(N) operation but it is not a - * problem since add / exists operations are called very infrequently and - * the hash table is supposed to contain very little elements at max. - * However without the cleanup during long uptime and with some automated - * node add/removal procedures, entries could accumulate. 
*/ -void clusterBlacklistCleanup(void) { - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes_black_list); - while((de = dictNext(di)) != NULL) { - int64_t expire = dictGetUnsignedIntegerVal(de); - - if (expire < server.unixtime) - dictDelete(server.cluster->nodes_black_list,dictGetKey(de)); - } - dictReleaseIterator(di); -} - -/* Cleanup the blacklist and add a new node ID to the black list. */ -void clusterBlacklistAddNode(clusterNode *node) { - dictEntry *de; - sds id = sdsnewlen(node->name,CLUSTER_NAMELEN); - - clusterBlacklistCleanup(); - if (dictAdd(server.cluster->nodes_black_list,id,NULL) == DICT_OK) { - /* If the key was added, duplicate the sds string representation of - * the key for the next lookup. We'll free it at the end. */ - id = sdsdup(id); - } - de = dictFind(server.cluster->nodes_black_list,id); - dictSetUnsignedIntegerVal(de,time(NULL)+CLUSTER_BLACKLIST_TTL); - sdsfree(id); -} - -/* Return non-zero if the specified node ID exists in the blacklist. - * You don't need to pass an sds string here, any pointer to 40 bytes - * will work. */ -int clusterBlacklistExists(char *nodeid) { - sds id = sdsnewlen(nodeid,CLUSTER_NAMELEN); - int retval; - - clusterBlacklistCleanup(); - retval = dictFind(server.cluster->nodes_black_list,id) != NULL; - sdsfree(id); - return retval; -} - -/* ----------------------------------------------------------------------------- - * CLUSTER messages exchange - PING/PONG and gossip - * -------------------------------------------------------------------------- */ - -/* This function checks if a given node should be marked as FAIL. - * It happens if the following conditions are met: - * - * 1) We received enough failure reports from other master nodes via gossip. - * Enough means that the majority of the masters signaled the node is - * down recently. - * 2) We believe this node is in PFAIL state. 
- * - * If a failure is detected we also inform the whole cluster about this - * event trying to force every other node to set the FAIL flag for the node. - * - * Note that the form of agreement used here is weak, as we collect the majority - * of masters state during some time, and even if we force agreement by - * propagating the FAIL message, because of partitions we may not reach every - * node. However: - * - * 1) Either we reach the majority and eventually the FAIL state will propagate - * to all the cluster. - * 2) Or there is no majority so no slave promotion will be authorized and the - * FAIL flag will be cleared after some time. - */ -void markNodeAsFailingIfNeeded(clusterNode *node) { - int failures; - int needed_quorum = (server.cluster->size / 2) + 1; - - if (!nodeTimedOut(node)) return; /* We can reach it. */ - if (nodeFailed(node)) return; /* Already FAILing. */ - - failures = clusterNodeFailureReportsCount(node); - /* Also count myself as a voter if I'm a master. */ - if (nodeIsMaster(myself)) failures++; - if (failures < needed_quorum) return; /* No weak agreement from masters. */ - - serverLog(LL_NOTICE, - "Marking node %.40s (%s) as failing (quorum reached).", node->name, node->human_nodename); - - /* Mark the node as failing. */ - node->flags &= ~CLUSTER_NODE_PFAIL; - node->flags |= CLUSTER_NODE_FAIL; - node->fail_time = mstime(); - - /* Broadcast the failing node name to everybody, forcing all the other - * reachable nodes to flag the node as FAIL. - * We do that even if this node is a replica and not a master: anyway - * the failing state is triggered collecting failure reports from masters, - * so here the replica is only helping propagating this status. */ - clusterSendFail(node->name); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); -} - -/* This function is called only if a node is marked as FAIL, but we are able - * to reach it again. It checks if there are the conditions to undo the FAIL - * state. 
*/ -void clearNodeFailureIfNeeded(clusterNode *node) { - mstime_t now = mstime(); - - serverAssert(nodeFailed(node)); - - /* For slaves we always clear the FAIL flag if we can contact the - * node again. */ - if (nodeIsSlave(node) || node->numslots == 0) { - serverLog(LL_NOTICE, - "Clear FAIL state for node %.40s (%s):%s is reachable again.", - node->name,node->human_nodename, - nodeIsSlave(node) ? "replica" : "master without slots"); - node->flags &= ~CLUSTER_NODE_FAIL; - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - } - - /* If it is a master and... - * 1) The FAIL state is old enough. - * 2) It is yet serving slots from our point of view (not failed over). - * Apparently no one is going to fix these slots, clear the FAIL flag. */ - if (nodeIsMaster(node) && node->numslots > 0 && - (now - node->fail_time) > - (server.cluster_node_timeout * CLUSTER_FAIL_UNDO_TIME_MULT)) - { - serverLog(LL_NOTICE, - "Clear FAIL state for node %.40s (%s): is reachable again and nobody is serving its slots after some time.", - node->name, node->human_nodename); - node->flags &= ~CLUSTER_NODE_FAIL; - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - } -} - -/* Return true if we already have a node in HANDSHAKE state matching the - * specified ip address and port number. This function is used in order to - * avoid adding a new handshake node for the same address multiple times. */ -int clusterHandshakeInProgress(char *ip, int port, int cport) { - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (!nodeInHandshake(node)) continue; - if (!strcasecmp(node->ip,ip) && - getNodeDefaultClientPort(node) == port && - node->cport == cport) break; - } - dictReleaseIterator(di); - return de != NULL; -} - -/* Start a handshake with the specified address if there is not one - * already in progress. 
Returns non-zero if the handshake was actually - * started. On error zero is returned and errno is set to one of the - * following values: - * - * EAGAIN - There is already a handshake in progress for this address. - * EINVAL - IP or port are not valid. */ -int clusterStartHandshake(char *ip, int port, int cport) { - clusterNode *n; - char norm_ip[NET_IP_STR_LEN]; - struct sockaddr_storage sa; - - /* IP sanity check */ - if (inet_pton(AF_INET,ip, - &(((struct sockaddr_in *)&sa)->sin_addr))) - { - sa.ss_family = AF_INET; - } else if (inet_pton(AF_INET6,ip, - &(((struct sockaddr_in6 *)&sa)->sin6_addr))) - { - sa.ss_family = AF_INET6; - } else { - errno = EINVAL; - return 0; - } - - /* Port sanity check */ - if (port <= 0 || port > 65535 || cport <= 0 || cport > 65535) { - errno = EINVAL; - return 0; - } - - /* Set norm_ip as the normalized string representation of the node - * IP address. */ - memset(norm_ip,0,NET_IP_STR_LEN); - if (sa.ss_family == AF_INET) - inet_ntop(AF_INET, - (void*)&(((struct sockaddr_in *)&sa)->sin_addr), - norm_ip,NET_IP_STR_LEN); - else - inet_ntop(AF_INET6, - (void*)&(((struct sockaddr_in6 *)&sa)->sin6_addr), - norm_ip,NET_IP_STR_LEN); - - if (clusterHandshakeInProgress(norm_ip,port,cport)) { - errno = EAGAIN; - return 0; - } - - /* Add the node with a random address (NULL as first argument to - * createClusterNode()). Everything will be fixed during the - * handshake. 
*/ - n = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_MEET); - memcpy(n->ip,norm_ip,sizeof(n->ip)); - if (server.tls_cluster) { - n->tls_port = port; - } else { - n->tcp_port = port; - } - n->cport = cport; - clusterAddNode(n); - return 1; -} - -static void getClientPortFromClusterMsg(clusterMsg *hdr, int *tls_port, int *tcp_port) { - if (server.tls_cluster) { - *tls_port = ntohs(hdr->port); - *tcp_port = ntohs(hdr->pport); - } else { - *tls_port = ntohs(hdr->pport); - *tcp_port = ntohs(hdr->port); - } -} - -static void getClientPortFromGossip(clusterMsgDataGossip *g, int *tls_port, int *tcp_port) { - if (server.tls_cluster) { - *tls_port = ntohs(g->port); - *tcp_port = ntohs(g->pport); - } else { - *tls_port = ntohs(g->pport); - *tcp_port = ntohs(g->port); - } -} - -/* Process the gossip section of PING or PONG packets. - * Note that this function assumes that the packet is already sanity-checked - * by the caller, not in the content of the gossip section, but in the - * length. */ -void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { - uint16_t count = ntohs(hdr->count); - clusterMsgDataGossip *g = (clusterMsgDataGossip*) hdr->data.ping.gossip; - clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender, CLUSTER_NAMELEN); - - while(count--) { - uint16_t flags = ntohs(g->flags); - clusterNode *node; - sds ci; - - if (server.verbosity == LL_DEBUG) { - ci = representClusterNodeFlags(sdsempty(), flags); - serverLog(LL_DEBUG,"GOSSIP %.40s %s:%d@%d %s", - g->nodename, - g->ip, - ntohs(g->port), - ntohs(g->cport), - ci); - sdsfree(ci); - } - - /* Convert port and pport into TCP port and TLS port. */ - int msg_tls_port, msg_tcp_port; - getClientPortFromGossip(g, &msg_tls_port, &msg_tcp_port); - - /* Update our state accordingly to the gossip sections */ - node = clusterLookupNode(g->nodename, CLUSTER_NAMELEN); - if (node) { - /* We already know this node. - Handle failure reports, only when the sender is a master. 
*/ - if (sender && nodeIsMaster(sender) && node != myself) { - if (flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) { - if (clusterNodeAddFailureReport(node,sender)) { - serverLog(LL_VERBOSE, - "Node %.40s (%s) reported node %.40s (%s) as not reachable.", - sender->name, sender->human_nodename, node->name, node->human_nodename); - } - markNodeAsFailingIfNeeded(node); - } else { - if (clusterNodeDelFailureReport(node,sender)) { - serverLog(LL_VERBOSE, - "Node %.40s (%s) reported node %.40s (%s) is back online.", - sender->name, sender->human_nodename, node->name, node->human_nodename); - } - } - } - - /* If from our POV the node is up (no failure flags are set), - * we have no pending ping for the node, nor we have failure - * reports for this node, update the last pong time with the - * one we see from the other nodes. */ - if (!(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) && - node->ping_sent == 0 && - clusterNodeFailureReportsCount(node) == 0) - { - mstime_t pongtime = ntohl(g->pong_received); - pongtime *= 1000; /* Convert back to milliseconds. */ - - /* Replace the pong time with the received one only if - * it's greater than our view but is not in the future - * (with 500 milliseconds tolerance) from the POV of our - * clock. */ - if (pongtime <= (server.mstime+500) && - pongtime > node->pong_received) - { - node->pong_received = pongtime; - } - } - - /* If we already know this node, but it is not reachable, and - * we see a different address in the gossip section of a node that - * can talk with this other node, update the address, disconnect - * the old link if any, so that we'll attempt to connect with the - * new address. */ - if (node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL) && - !(flags & CLUSTER_NODE_NOADDR) && - !(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) && - (strcasecmp(node->ip,g->ip) || - node->tls_port != (server.tls_cluster ? ntohs(g->port) : ntohs(g->pport)) || - node->tcp_port != (server.tls_cluster ? 
ntohs(g->pport) : ntohs(g->port)) || - node->cport != ntohs(g->cport))) - { - if (node->link) freeClusterLink(node->link); - memcpy(node->ip,g->ip,NET_IP_STR_LEN); - node->tcp_port = msg_tcp_port; - node->tls_port = msg_tls_port; - node->cport = ntohs(g->cport); - node->flags &= ~CLUSTER_NODE_NOADDR; - } - } else { - /* If it's not in NOADDR state and we don't have it, we - * add it to our trusted dict with exact nodeid and flag. - * Note that we cannot simply start a handshake against - * this IP/PORT pairs, since IP/PORT can be reused already, - * otherwise we risk joining another cluster. - * - * Note that we require that the sender of this gossip message - * is a well known node in our cluster, otherwise we risk - * joining another cluster. */ - if (sender && - !(flags & CLUSTER_NODE_NOADDR) && - !clusterBlacklistExists(g->nodename)) - { - clusterNode *node; - node = createClusterNode(g->nodename, flags); - memcpy(node->ip,g->ip,NET_IP_STR_LEN); - node->tcp_port = msg_tcp_port; - node->tls_port = msg_tls_port; - node->cport = ntohs(g->cport); - clusterAddNode(node); - } - } - - /* Next node */ - g++; - } -} - -/* IP -> string conversion. 'buf' is supposed to at least be 46 bytes. - * If 'announced_ip' length is non-zero, it is used instead of extracting - * the IP from the socket peer address. */ -int nodeIp2String(char *buf, clusterLink *link, char *announced_ip) { - if (announced_ip[0] != '\0') { - memcpy(buf,announced_ip,NET_IP_STR_LEN); - buf[NET_IP_STR_LEN-1] = '\0'; /* We are not sure the input is sane. */ - return C_OK; - } else { - if (connAddrPeerName(link->conn, buf, NET_IP_STR_LEN, NULL) == -1) { - serverLog(LL_NOTICE, "Error converting peer IP to string: %s", - link->conn ? connGetLastError(link->conn) : "no link"); - return C_ERR; - } - return C_OK; - } -} - -/* Update the node address to the IP address that can be extracted - * from link->fd, or if hdr->myip is non empty, to the address the node - * is announcing us. 
The port is taken from the packet header as well. - * - * If the address or port changed, disconnect the node link so that we'll - * connect again to the new address. - * - * If the ip/port pair are already correct no operation is performed at - * all. - * - * The function returns 0 if the node address is still the same, - * otherwise 1 is returned. */ -int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link, - clusterMsg *hdr) -{ - char ip[NET_IP_STR_LEN] = {0}; - int cport = ntohs(hdr->cport); - int tcp_port, tls_port; - getClientPortFromClusterMsg(hdr, &tls_port, &tcp_port); - - /* We don't proceed if the link is the same as the sender link, as this - * function is designed to see if the node link is consistent with the - * symmetric link that is used to receive PINGs from the node. - * - * As a side effect this function never frees the passed 'link', so - * it is safe to call during packet processing. */ - if (link == node->link) return 0; - - /* If the peer IP is unavailable for some reasons like invalid fd or closed - * link, just give up the update this time, and the update will be retried - * in the next round of PINGs */ - if (nodeIp2String(ip,link,hdr->myip) == C_ERR) return 0; - - if (node->tcp_port == tcp_port && node->cport == cport && node->tls_port == tls_port && - strcmp(ip,node->ip) == 0) return 0; - - /* IP / port is different, update it. */ - memcpy(node->ip,ip,sizeof(ip)); - node->tcp_port = tcp_port; - node->tls_port = tls_port; - node->cport = cport; - if (node->link) freeClusterLink(node->link); - node->flags &= ~CLUSTER_NODE_NOADDR; - serverLog(LL_NOTICE,"Address updated for node %.40s (%s), now %s:%d", - node->name, node->human_nodename, node->ip, getNodeDefaultClientPort(node)); - - /* Check if this is our master and we have to change the - * replication target as well. 
*/ - if (nodeIsSlave(myself) && myself->slaveof == node) - replicationSetMaster(node->ip, getNodeDefaultReplicationPort(node)); - return 1; -} - -/* Reconfigure the specified node 'n' as a master. This function is called when - * a node that we believed to be a slave is now acting as master in order to - * update the state of the node. */ -void clusterSetNodeAsMaster(clusterNode *n) { - if (nodeIsMaster(n)) return; - - if (n->slaveof) { - clusterNodeRemoveSlave(n->slaveof,n); - if (n != myself) n->flags |= CLUSTER_NODE_MIGRATE_TO; - } - n->flags &= ~CLUSTER_NODE_SLAVE; - n->flags |= CLUSTER_NODE_MASTER; - n->slaveof = NULL; - - /* Update config and state. */ - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); -} - -/* This function is called when we receive a master configuration via a - * PING, PONG or UPDATE packet. What we receive is a node, a configEpoch of the - * node, and the set of slots claimed under this configEpoch. - * - * What we do is to rebind the slots with newer configuration compared to our - * local configuration, and if needed, we turn ourself into a replica of the - * node (see the function comments for more info). - * - * The 'sender' is the node for which we received a configuration update. - * Sometimes it is not actually the "Sender" of the information, like in the - * case we receive the info via an UPDATE packet. */ -void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoch, unsigned char *slots) { - int j; - clusterNode *curmaster = NULL, *newmaster = NULL; - /* The dirty slots list is a list of slots for which we lose the ownership - * while having still keys inside. This usually happens after a failover - * or after a manual cluster reconfiguration operated by the admin. - * - * If the update message is not able to demote a master to slave (in this - * case we'll resync with the master updating the whole key space), we - * need to delete all the keys in the slots we lost ownership. 
*/ - uint16_t dirty_slots[CLUSTER_SLOTS]; - int dirty_slots_count = 0; - - /* We should detect if sender is new master of our shard. - * We will know it if all our slots were migrated to sender, and sender - * has no slots except ours */ - int sender_slots = 0; - int migrated_our_slots = 0; - - /* Here we set curmaster to this node or the node this node - * replicates to if it's a slave. In the for loop we are - * interested to check if slots are taken away from curmaster. */ - curmaster = nodeIsMaster(myself) ? myself : myself->slaveof; - - if (sender == myself) { - serverLog(LL_NOTICE,"Discarding UPDATE message about myself."); - return; - } - - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (bitmapTestBit(slots,j)) { - sender_slots++; - - /* The slot is already bound to the sender of this message. */ - if (server.cluster->slots[j] == sender) { - bitmapClearBit(server.cluster->owner_not_claiming_slot, j); - continue; - } - - /* The slot is in importing state, it should be modified only - * manually via redis-cli (example: a resharding is in progress - * and the migrating side slot was already closed and is advertising - * a new config. We still want the slot to be closed manually). */ - if (server.cluster->importing_slots_from[j]) continue; - - /* We rebind the slot to the new node claiming it if: - * 1) The slot was unassigned or the previous owner no longer owns the slot or - * the new node claims it with a greater configEpoch. - * 2) We are not currently importing the slot. */ - if (isSlotUnclaimed(j) || - server.cluster->slots[j]->configEpoch < senderConfigEpoch) - { - /* Was this slot mine, and still contains keys? Mark it as - * a dirty slot. 
*/ - if (server.cluster->slots[j] == myself && - countKeysInSlot(j) && - sender != myself) - { - dirty_slots[dirty_slots_count] = j; - dirty_slots_count++; - } - - if (server.cluster->slots[j] == curmaster) { - newmaster = sender; - migrated_our_slots++; - } - clusterDelSlot(j); - clusterAddSlot(sender,j); - bitmapClearBit(server.cluster->owner_not_claiming_slot, j); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); - } - } else if (server.cluster->slots[j] == sender) { - /* The slot is currently bound to the sender but the sender is no longer - * claiming it. We don't want to unbind the slot yet as it can cause the cluster - * to move to FAIL state and also throw client error. Keeping the slot bound to - * the previous owner will cause a few client side redirects, but won't throw - * any errors. We will keep track of the uncertainty in ownership to avoid - * propagating misinformation about this slot's ownership using UPDATE - * messages. */ - bitmapSetBit(server.cluster->owner_not_claiming_slot, j); - } - } - - /* After updating the slots configuration, don't do any actual change - * in the state of the server if a module disabled Redis Cluster - * keys redirections. */ - if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) - return; - - /* If at least one slot was reassigned from a node to another node - * with a greater configEpoch, it is possible that: - * 1) We are a master left without slots. This means that we were - * failed over and we should turn into a replica of the new - * master. - * 2) We are a slave and our master is left without slots. We need - * to replicate to the new slots owner. */ - if (newmaster && curmaster->numslots == 0 && - (server.cluster_allow_replica_migration || - sender_slots == migrated_our_slots)) { - serverLog(LL_NOTICE, - "Configuration change detected. 
Reconfiguring myself " - "as a replica of %.40s (%s)", sender->name, sender->human_nodename); - clusterSetMaster(sender); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); - } else if (myself->slaveof && myself->slaveof->slaveof && - /* In some rare case when CLUSTER FAILOVER TAKEOVER is used, it - * can happen that myself is a replica of a replica of myself. If - * this happens, we do nothing to avoid a crash and wait for the - * admin to repair the cluster. */ - myself->slaveof->slaveof != myself) - { - /* Safeguard against sub-replicas. A replica's master can turn itself - * into a replica if its last slot is removed. If no other node takes - * over the slot, there is nothing else to trigger replica migration. */ - serverLog(LL_NOTICE, - "I'm a sub-replica! Reconfiguring myself as a replica of grandmaster %.40s (%s)", - myself->slaveof->slaveof->name, myself->slaveof->slaveof->human_nodename); - clusterSetMaster(myself->slaveof->slaveof); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); - } else if (dirty_slots_count) { - /* If we are here, we received an update message which removed - * ownership for certain slots we still have keys about, but still - * we are serving some slots, so this master node was not demoted to - * a slave. - * - * In order to maintain a consistent state between keys and slots - * we need to remove all the keys from the slots we lost. */ - for (j = 0; j < dirty_slots_count; j++) - delKeysInSlot(dirty_slots[j]); - } -} - -/* Cluster ping extensions. - * - * The ping/pong/meet messages support arbitrary extensions to add additional - * metadata to the messages that are sent between the various nodes in the - * cluster. 
The extensions take the form: - * [ Header length + type (8 bytes) ] - * [ Extension information (Arbitrary length, but must be 8 byte padded) ] - */ - - -/* Returns the length of a given extension */ -static uint32_t getPingExtLength(clusterMsgPingExt *ext) { - return ntohl(ext->length); -} - -/* Returns the initial position of ping extensions. May return an invalid - * address if there are no ping extensions. */ -static clusterMsgPingExt *getInitialPingExt(clusterMsg *hdr, int count) { - clusterMsgPingExt *initial = (clusterMsgPingExt*) &(hdr->data.ping.gossip[count]); - return initial; -} - -/* Given a current ping extension, returns the start of the next extension. May return - * an invalid address if there are no further ping extensions. */ -static clusterMsgPingExt *getNextPingExt(clusterMsgPingExt *ext) { - clusterMsgPingExt *next = (clusterMsgPingExt *) (((char *) ext) + getPingExtLength(ext)); - return next; -} - -/* All PING extensions must be 8-byte aligned */ -uint32_t getAlignedPingExtSize(uint32_t dataSize) { - - return sizeof(clusterMsgPingExt) + EIGHT_BYTE_ALIGN(dataSize); -} - -uint32_t getHostnamePingExtSize(void) { - if (sdslen(myself->hostname) == 0) { - return 0; - } - return getAlignedPingExtSize(sdslen(myself->hostname) + 1); -} - -uint32_t getHumanNodenamePingExtSize(void) { - if (sdslen(myself->human_nodename) == 0) { - return 0; - } - return getAlignedPingExtSize(sdslen(myself->human_nodename) + 1); -} - -uint32_t getShardIdPingExtSize(void) { - return getAlignedPingExtSize(sizeof(clusterMsgPingExtShardId)); -} - -uint32_t getForgottenNodeExtSize(void) { - return getAlignedPingExtSize(sizeof(clusterMsgPingExtForgottenNode)); -} - -void *preparePingExt(clusterMsgPingExt *ext, uint16_t type, uint32_t length) { - ext->type = htons(type); - ext->length = htonl(length); - return &ext->ext[0]; -} - -clusterMsgPingExt *nextPingExt(clusterMsgPingExt *ext) { - return (clusterMsgPingExt *)((char*)ext + ntohl(ext->length)); -} - -/* 1. 
If a NULL hdr is provided, compute the extension size; - * 2. If a non-NULL hdr is provided, write the hostname ping - * extension at the start of the cursor. This function - * will update the cursor to point to the end of the - * written extension and will return the amount of bytes - * written. */ -uint32_t writePingExt(clusterMsg *hdr, int gossipcount) { - uint16_t extensions = 0; - uint32_t totlen = 0; - clusterMsgPingExt *cursor = NULL; - /* Set the initial extension position */ - if (hdr != NULL) { - cursor = getInitialPingExt(hdr, gossipcount); - } - - /* hostname is optional */ - if (sdslen(myself->hostname) != 0) { - if (cursor != NULL) { - /* Populate hostname */ - clusterMsgPingExtHostname *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_HOSTNAME, getHostnamePingExtSize()); - memcpy(ext->hostname, myself->hostname, sdslen(myself->hostname)); - - /* Move the write cursor */ - cursor = nextPingExt(cursor); - } - - totlen += getHostnamePingExtSize(); - extensions++; - } - - if (sdslen(myself->human_nodename) != 0) { - if (cursor != NULL) { - /* Populate human_nodename */ - clusterMsgPingExtHumanNodename *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME, getHumanNodenamePingExtSize()); - memcpy(ext->human_nodename, myself->human_nodename, sdslen(myself->human_nodename)); - - /* Move the write cursor */ - cursor = nextPingExt(cursor); - } - - totlen += getHumanNodenamePingExtSize(); - extensions++; - } - - /* Gossip forgotten nodes */ - if (dictSize(server.cluster->nodes_black_list) > 0) { - dictIterator *di = dictGetIterator(server.cluster->nodes_black_list); - dictEntry *de; - while ((de = dictNext(di)) != NULL) { - if (cursor != NULL) { - uint64_t expire = dictGetUnsignedIntegerVal(de); - if ((time_t)expire < server.unixtime) continue; /* already expired */ - uint64_t ttl = expire - server.unixtime; - clusterMsgPingExtForgottenNode *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE, getForgottenNodeExtSize()); - 
memcpy(ext->name, dictGetKey(de), CLUSTER_NAMELEN); - ext->ttl = htonu64(ttl); - - /* Move the write cursor */ - cursor = nextPingExt(cursor); - } - totlen += getForgottenNodeExtSize(); - extensions++; - } - dictReleaseIterator(di); - } - - /* Populate shard_id */ - if (cursor != NULL) { - clusterMsgPingExtShardId *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_SHARDID, getShardIdPingExtSize()); - memcpy(ext->shard_id, myself->shard_id, CLUSTER_NAMELEN); - - /* Move the write cursor */ - cursor = nextPingExt(cursor); - } - totlen += getShardIdPingExtSize(); - extensions++; - - if (hdr != NULL) { - if (extensions != 0) { - hdr->mflags[0] |= CLUSTERMSG_FLAG0_EXT_DATA; - } - hdr->extensions = htons(extensions); - } - - return totlen; -} - -/* We previously validated the extensions, so this function just needs to - * handle the extensions. */ -void clusterProcessPingExtensions(clusterMsg *hdr, clusterLink *link) { - clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender, CLUSTER_NAMELEN); - char *ext_hostname = NULL; - char *ext_humannodename = NULL; - char *ext_shardid = NULL; - uint16_t extensions = ntohs(hdr->extensions); - /* Loop through all the extensions and process them */ - clusterMsgPingExt *ext = getInitialPingExt(hdr, ntohs(hdr->count)); - while (extensions--) { - uint16_t type = ntohs(ext->type); - if (type == CLUSTERMSG_EXT_TYPE_HOSTNAME) { - clusterMsgPingExtHostname *hostname_ext = (clusterMsgPingExtHostname *) &(ext->ext[0].hostname); - ext_hostname = hostname_ext->hostname; - } else if (type == CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME) { - clusterMsgPingExtHumanNodename *humannodename_ext = (clusterMsgPingExtHumanNodename *) &(ext->ext[0].human_nodename); - ext_humannodename = humannodename_ext->human_nodename; - } else if (type == CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE) { - clusterMsgPingExtForgottenNode *forgotten_node_ext = &(ext->ext[0].forgotten_node); - clusterNode *n = clusterLookupNode(forgotten_node_ext->name, CLUSTER_NAMELEN); 
- if (n && n != myself && !(nodeIsSlave(myself) && myself->slaveof == n)) { - sds id = sdsnewlen(forgotten_node_ext->name, CLUSTER_NAMELEN); - dictEntry *de = dictAddRaw(server.cluster->nodes_black_list, id, NULL); - serverAssert(de != NULL); - uint64_t expire = server.unixtime + ntohu64(forgotten_node_ext->ttl); - dictSetUnsignedIntegerVal(de, expire); - clusterDelNode(n); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_SAVE_CONFIG); - } - } else if (type == CLUSTERMSG_EXT_TYPE_SHARDID) { - clusterMsgPingExtShardId *shardid_ext = (clusterMsgPingExtShardId *) &(ext->ext[0].shard_id); - ext_shardid = shardid_ext->shard_id; - } else { - /* Unknown type, we will ignore it but log what happened. */ - serverLog(LL_WARNING, "Received unknown extension type %d", type); - } - - /* We know this will be valid since we validated it ahead of time */ - ext = getNextPingExt(ext); - } - /* If the node did not send us a hostname extension, assume - * they don't have an announced hostname. Otherwise, we'll - * set it now. */ - updateAnnouncedHostname(sender, ext_hostname); - updateAnnouncedHumanNodename(sender, ext_humannodename); - updateShardId(sender, ext_shardid); -} - -static clusterNode *getNodeFromLinkAndMsg(clusterLink *link, clusterMsg *hdr) { - clusterNode *sender; - if (link->node && !nodeInHandshake(link->node)) { - /* If the link has an associated node, use that so that we don't have to look it - * up every time, except when the node is still in handshake, the node still has - * a random name thus not truly "known". */ - sender = link->node; - } else { - /* Otherwise, fetch sender based on the message */ - sender = clusterLookupNode(hdr->sender, CLUSTER_NAMELEN); - /* We know the sender node but haven't associate it with the link. This must - * be an inbound link because only for inbound links we didn't know which node - * to associate when they were created. 
*/ - if (sender && !link->node) { - setClusterNodeToInboundClusterLink(sender, link); - } - } - return sender; -} - -/* When this function is called, there is a packet to process starting - * at link->rcvbuf. Releasing the buffer is up to the caller, so this - * function should just handle the higher level stuff of processing the - * packet, modifying the cluster state if needed. - * - * The function returns 1 if the link is still valid after the packet - * was processed, otherwise 0 if the link was freed since the packet - * processing lead to some inconsistency error (for instance a PONG - * received from the wrong sender ID). */ -int clusterProcessPacket(clusterLink *link) { - clusterMsg *hdr = (clusterMsg*) link->rcvbuf; - uint32_t totlen = ntohl(hdr->totlen); - uint16_t type = ntohs(hdr->type); - mstime_t now = mstime(); - - if (type < CLUSTERMSG_TYPE_COUNT) - server.cluster->stats_bus_messages_received[type]++; - serverLog(LL_DEBUG,"--- Processing packet of type %s, %lu bytes", - clusterGetMessageTypeString(type), (unsigned long) totlen); - - /* Perform sanity checks */ - if (totlen < 16) return 1; /* At least signature, version, totlen, count. */ - if (totlen > link->rcvbuf_len) return 1; - - if (ntohs(hdr->ver) != CLUSTER_PROTO_VER) { - /* Can't handle messages of different versions. 
*/ - return 1; - } - - if (type == server.cluster_drop_packet_filter) { - serverLog(LL_WARNING, "Dropping packet that matches debug drop filter"); - return 1; - } - - uint16_t flags = ntohs(hdr->flags); - uint16_t extensions = ntohs(hdr->extensions); - uint64_t senderCurrentEpoch = 0, senderConfigEpoch = 0; - uint32_t explen; /* expected length of this packet */ - clusterNode *sender; - - if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || - type == CLUSTERMSG_TYPE_MEET) - { - uint16_t count = ntohs(hdr->count); - - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - explen += (sizeof(clusterMsgDataGossip)*count); - - /* If there is extension data, which doesn't have a fixed length, - * loop through them and validate the length of it now. */ - if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { - clusterMsgPingExt *ext = getInitialPingExt(hdr, count); - while (extensions--) { - uint16_t extlen = getPingExtLength(ext); - if (extlen % 8 != 0) { - serverLog(LL_WARNING, "Received a %s packet without proper padding (%d bytes)", - clusterGetMessageTypeString(type), (int) extlen); - return 1; - } - if ((totlen - explen) < extlen) { - serverLog(LL_WARNING, "Received invalid %s packet with extension data that exceeds " - "total packet length (%lld)", clusterGetMessageTypeString(type), - (unsigned long long) totlen); - return 1; - } - explen += extlen; - ext = getNextPingExt(ext); - } - } - } else if (type == CLUSTERMSG_TYPE_FAIL) { - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - explen += sizeof(clusterMsgDataFail); - } else if (type == CLUSTERMSG_TYPE_PUBLISH || type == CLUSTERMSG_TYPE_PUBLISHSHARD) { - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - explen += sizeof(clusterMsgDataPublish) - - 8 + - ntohl(hdr->data.publish.msg.channel_len) + - ntohl(hdr->data.publish.msg.message_len); - } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST || - type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK || - type == CLUSTERMSG_TYPE_MFSTART) 
- { - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - } else if (type == CLUSTERMSG_TYPE_UPDATE) { - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - explen += sizeof(clusterMsgDataUpdate); - } else if (type == CLUSTERMSG_TYPE_MODULE) { - explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - explen += sizeof(clusterMsgModule) - - 3 + ntohl(hdr->data.module.msg.len); - } else { - /* We don't know this type of packet, so we assume it's well formed. */ - explen = totlen; - } - - if (totlen != explen) { - serverLog(LL_WARNING, "Received invalid %s packet of length %lld but expected length %lld", - clusterGetMessageTypeString(type), (unsigned long long) totlen, (unsigned long long) explen); - return 1; - } - - sender = getNodeFromLinkAndMsg(link, hdr); - - /* Update the last time we saw any data from this node. We - * use this in order to avoid detecting a timeout from a node that - * is just sending a lot of data in the cluster bus, for instance - * because of Pub/Sub. */ - if (sender) sender->data_received = now; - - if (sender && !nodeInHandshake(sender)) { - /* Update our currentEpoch if we see a newer epoch in the cluster. */ - senderCurrentEpoch = ntohu64(hdr->currentEpoch); - senderConfigEpoch = ntohu64(hdr->configEpoch); - if (senderCurrentEpoch > server.cluster->currentEpoch) - server.cluster->currentEpoch = senderCurrentEpoch; - /* Update the sender configEpoch if it is publishing a newer one. */ - if (senderConfigEpoch > sender->configEpoch) { - sender->configEpoch = senderConfigEpoch; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_FSYNC_CONFIG); - } - /* Update the replication offset info for this node. */ - sender->repl_offset = ntohu64(hdr->offset); - sender->repl_offset_time = now; - /* If we are a slave performing a manual failover and our master - * sent its offset while already paused, populate the MF state. 
*/ - if (server.cluster->mf_end && - nodeIsSlave(myself) && - myself->slaveof == sender && - hdr->mflags[0] & CLUSTERMSG_FLAG0_PAUSED && - server.cluster->mf_master_offset == -1) - { - server.cluster->mf_master_offset = sender->repl_offset; - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); - serverLog(LL_NOTICE, - "Received replication offset for paused " - "master manual failover: %lld", - server.cluster->mf_master_offset); - } - } - - /* Initial processing of PING and MEET requests replying with a PONG. */ - if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_MEET) { - /* We use incoming MEET messages in order to set the address - * for 'myself', since only other cluster nodes will send us - * MEET messages on handshakes, when the cluster joins, or - * later if we changed address, and those nodes will use our - * official address to connect to us. So by obtaining this address - * from the socket is a simple way to discover / update our own - * address in the cluster without it being hardcoded in the config. - * - * However if we don't have an address at all, we update the address - * even with a normal PING packet. If it's wrong it will be fixed - * by MEET later. */ - if ((type == CLUSTERMSG_TYPE_MEET || myself->ip[0] == '\0') && - server.cluster_announce_ip == NULL) - { - char ip[NET_IP_STR_LEN]; - - if (connAddrSockName(link->conn,ip,sizeof(ip),NULL) != -1 && - strcmp(ip,myself->ip)) - { - memcpy(myself->ip,ip,NET_IP_STR_LEN); - serverLog(LL_NOTICE,"IP address for this node updated to %s", - myself->ip); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - } - } - - /* Add this node if it is new for us and the msg type is MEET. - * In this stage we don't try to add the node with the right - * flags, slaveof pointer, and so forth, as this details will be - * resolved when we'll receive PONGs from the node. 
*/ - if (!sender && type == CLUSTERMSG_TYPE_MEET) { - clusterNode *node; - - node = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE); - serverAssert(nodeIp2String(node->ip,link,hdr->myip) == C_OK); - getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); - node->cport = ntohs(hdr->cport); - clusterAddNode(node); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - } - - /* If this is a MEET packet from an unknown node, we still process - * the gossip section here since we have to trust the sender because - * of the message type. */ - if (!sender && type == CLUSTERMSG_TYPE_MEET) - clusterProcessGossipSection(hdr,link); - - /* Anyway reply with a PONG */ - clusterSendPing(link,CLUSTERMSG_TYPE_PONG); - } - - /* PING, PONG, MEET: process config information. */ - if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || - type == CLUSTERMSG_TYPE_MEET) - { - serverLog(LL_DEBUG,"%s packet received: %.40s", - clusterGetMessageTypeString(type), - link->node ? link->node->name : "NULL"); - if (!link->inbound) { - if (nodeInHandshake(link->node)) { - /* If we already have this node, try to change the - * IP/port of the node with the new one. */ - if (sender) { - serverLog(LL_VERBOSE, - "Handshake: we already know node %.40s (%s), " - "updating the address if needed.", sender->name, sender->human_nodename); - if (nodeUpdateAddressIfNeeded(sender,link,hdr)) - { - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } - /* Free this node as we already have it. This will - * cause the link to be freed as well. */ - clusterDelNode(link->node); - return 0; - } - - /* First thing to do is replacing the random name with the - * right node name if this was a handshake stage. 
*/ - clusterRenameNode(link->node, hdr->sender); - serverLog(LL_DEBUG,"Handshake with node %.40s completed.", - link->node->name); - link->node->flags &= ~CLUSTER_NODE_HANDSHAKE; - link->node->flags |= flags&(CLUSTER_NODE_MASTER|CLUSTER_NODE_SLAVE); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - } else if (memcmp(link->node->name,hdr->sender, - CLUSTER_NAMELEN) != 0) - { - /* If the reply has a non matching node ID we - * disconnect this node and set it as not having an associated - * address. */ - serverLog(LL_DEBUG,"PONG contains mismatching sender ID. About node %.40s added %d ms ago, having flags %d", - link->node->name, - (int)(now-(link->node->ctime)), - link->node->flags); - link->node->flags |= CLUSTER_NODE_NOADDR; - link->node->ip[0] = '\0'; - link->node->tcp_port = 0; - link->node->tls_port = 0; - link->node->cport = 0; - freeClusterLink(link); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - return 0; - } - } - - /* Copy the CLUSTER_NODE_NOFAILOVER flag from what the sender - * announced. This is a dynamic flag that we receive from the - * sender, and the latest status must be trusted. We need it to - * be propagated because the slave ranking used to understand the - * delay of each slave in the voting process, needs to know - * what are the instances really competing. */ - if (sender) { - int nofailover = flags & CLUSTER_NODE_NOFAILOVER; - sender->flags &= ~CLUSTER_NODE_NOFAILOVER; - sender->flags |= nofailover; - } - - /* Update the node address if it changed. 
*/ - if (sender && type == CLUSTERMSG_TYPE_PING && - !nodeInHandshake(sender) && - nodeUpdateAddressIfNeeded(sender,link,hdr)) - { - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } - - /* Update our info about the node */ - if (!link->inbound && type == CLUSTERMSG_TYPE_PONG) { - link->node->pong_received = now; - link->node->ping_sent = 0; - - /* The PFAIL condition can be reversed without external - * help if it is momentary (that is, if it does not - * turn into a FAIL state). - * - * The FAIL condition is also reversible under specific - * conditions detected by clearNodeFailureIfNeeded(). */ - if (nodeTimedOut(link->node)) { - link->node->flags &= ~CLUSTER_NODE_PFAIL; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } else if (nodeFailed(link->node)) { - clearNodeFailureIfNeeded(link->node); - } - } - - /* Check for role switch: slave -> master or master -> slave. */ - if (sender) { - if (!memcmp(hdr->slaveof,CLUSTER_NODE_NULL_NAME, - sizeof(hdr->slaveof))) - { - /* Node is a master. */ - clusterSetNodeAsMaster(sender); - } else { - /* Node is a slave. */ - clusterNode *master = clusterLookupNode(hdr->slaveof, CLUSTER_NAMELEN); - - if (nodeIsMaster(sender)) { - /* Master turned into a slave! Reconfigure the node. */ - clusterDelNodeSlots(sender); - sender->flags &= ~(CLUSTER_NODE_MASTER| - CLUSTER_NODE_MIGRATE_TO); - sender->flags |= CLUSTER_NODE_SLAVE; - - /* Update config and state. */ - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } - - /* Master node changed for this slave? */ - if (master && sender->slaveof != master) { - if (sender->slaveof) - clusterNodeRemoveSlave(sender->slaveof,sender); - clusterNodeAddSlave(master,sender); - sender->slaveof = master; - - /* Update config. */ - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); - } - } - } - - /* Update our info about served slots. 
- * - * Note: this MUST happen after we update the master/slave state - * so that CLUSTER_NODE_MASTER flag will be set. */ - - /* Many checks are only needed if the set of served slots this - * instance claims is different compared to the set of slots we have - * for it. Check this ASAP to avoid other computational expansive - * checks later. */ - clusterNode *sender_master = NULL; /* Sender or its master if slave. */ - int dirty_slots = 0; /* Sender claimed slots don't match my view? */ - - if (sender) { - sender_master = nodeIsMaster(sender) ? sender : sender->slaveof; - if (sender_master) { - dirty_slots = memcmp(sender_master->slots, - hdr->myslots,sizeof(hdr->myslots)) != 0; - } - } - - /* 1) If the sender of the message is a master, and we detected that - * the set of slots it claims changed, scan the slots to see if we - * need to update our configuration. */ - if (sender && nodeIsMaster(sender) && dirty_slots) - clusterUpdateSlotsConfigWith(sender,senderConfigEpoch,hdr->myslots); - - /* 2) We also check for the reverse condition, that is, the sender - * claims to serve slots we know are served by a master with a - * greater configEpoch. If this happens we inform the sender. - * - * This is useful because sometimes after a partition heals, a - * reappearing master may be the last one to claim a given set of - * hash slots, but with a configuration that other instances know to - * be deprecated. Example: - * - * A and B are master and slave for slots 1,2,3. - * A is partitioned away, B gets promoted. - * B is partitioned away, and A returns available. - * - * Usually B would PING A publishing its set of served slots and its - * configEpoch, but because of the partition B can't inform A of the - * new configuration, so other nodes that have an updated table must - * do it. In this way A will stop to act as a master (or can try to - * failover if there are the conditions to win the election). 
*/ - if (sender && dirty_slots) { - int j; - - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (bitmapTestBit(hdr->myslots,j)) { - if (server.cluster->slots[j] == sender || - isSlotUnclaimed(j)) continue; - if (server.cluster->slots[j]->configEpoch > - senderConfigEpoch) - { - serverLog(LL_VERBOSE, - "Node %.40s has old slots configuration, sending " - "an UPDATE message about %.40s", - sender->name, server.cluster->slots[j]->name); - clusterSendUpdate(sender->link, - server.cluster->slots[j]); - - /* TODO: instead of exiting the loop send every other - * UPDATE packet for other nodes that are the new owner - * of sender's slots. */ - break; - } - } - } - } - - /* If our config epoch collides with the sender's try to fix - * the problem. */ - if (sender && - nodeIsMaster(myself) && nodeIsMaster(sender) && - senderConfigEpoch == myself->configEpoch) - { - clusterHandleConfigEpochCollision(sender); - } - - /* Get info from the gossip section */ - if (sender) { - clusterProcessGossipSection(hdr,link); - clusterProcessPingExtensions(hdr,link); - } - } else if (type == CLUSTERMSG_TYPE_FAIL) { - clusterNode *failing; - - if (sender) { - failing = clusterLookupNode(hdr->data.fail.about.nodename, CLUSTER_NAMELEN); - if (failing && - !(failing->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_MYSELF))) - { - serverLog(LL_NOTICE, - "FAIL message received from %.40s (%s) about %.40s (%s)", - hdr->sender, sender->human_nodename, hdr->data.fail.about.nodename, failing->human_nodename); - failing->flags |= CLUSTER_NODE_FAIL; - failing->fail_time = now; - failing->flags &= ~CLUSTER_NODE_PFAIL; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE); - } - } else { - serverLog(LL_NOTICE, - "Ignoring FAIL message from unknown node %.40s about %.40s", - hdr->sender, hdr->data.fail.about.nodename); - } - } else if (type == CLUSTERMSG_TYPE_PUBLISH || type == CLUSTERMSG_TYPE_PUBLISHSHARD) { - if (!sender) return 1; /* We don't know that node. 
*/ - - robj *channel, *message; - uint32_t channel_len, message_len; - - /* Don't bother creating useless objects if there are no - * Pub/Sub subscribers. */ - if ((type == CLUSTERMSG_TYPE_PUBLISH - && serverPubsubSubscriptionCount() > 0) - || (type == CLUSTERMSG_TYPE_PUBLISHSHARD - && serverPubsubShardSubscriptionCount() > 0)) - { - channel_len = ntohl(hdr->data.publish.msg.channel_len); - message_len = ntohl(hdr->data.publish.msg.message_len); - channel = createStringObject( - (char*)hdr->data.publish.msg.bulk_data,channel_len); - message = createStringObject( - (char*)hdr->data.publish.msg.bulk_data+channel_len, - message_len); - pubsubPublishMessage(channel, message, type == CLUSTERMSG_TYPE_PUBLISHSHARD); - decrRefCount(channel); - decrRefCount(message); - } - } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST) { - if (!sender) return 1; /* We don't know that node. */ - clusterSendFailoverAuthIfNeeded(sender,hdr); - } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) { - if (!sender) return 1; /* We don't know that node. */ - /* We consider this vote only if the sender is a master serving - * a non zero number of slots, and its currentEpoch is greater or - * equal to epoch where this node started the election. */ - if (nodeIsMaster(sender) && sender->numslots > 0 && - senderCurrentEpoch >= server.cluster->failover_auth_epoch) - { - server.cluster->failover_auth_count++; - /* Maybe we reached a quorum here, set a flag to make sure - * we check ASAP. */ - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); - } - } else if (type == CLUSTERMSG_TYPE_MFSTART) { - /* This message is acceptable only if I'm a master and the sender - * is one of my slaves. */ - if (!sender || sender->slaveof != myself) return 1; - /* Manual failover requested from slaves. Initialize the state - * accordingly. 
*/ - resetManualFailover(); - server.cluster->mf_end = now + CLUSTER_MF_TIMEOUT; - server.cluster->mf_slave = sender; - pauseActions(PAUSE_DURING_FAILOVER, - now + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT), - PAUSE_ACTIONS_CLIENT_WRITE_SET); - serverLog(LL_NOTICE,"Manual failover requested by replica %.40s (%s).", - sender->name, sender->human_nodename); - /* We need to send a ping message to the replica, as it would carry - * `server.cluster->mf_master_offset`, which means the master paused clients - * at offset `server.cluster->mf_master_offset`, so that the replica would - * know that it is safe to set its `server.cluster->mf_can_start` to 1 so as - * to complete failover as quickly as possible. */ - clusterSendPing(link, CLUSTERMSG_TYPE_PING); - } else if (type == CLUSTERMSG_TYPE_UPDATE) { - clusterNode *n; /* The node the update is about. */ - uint64_t reportedConfigEpoch = - ntohu64(hdr->data.update.nodecfg.configEpoch); - - if (!sender) return 1; /* We don't know the sender. */ - n = clusterLookupNode(hdr->data.update.nodecfg.nodename, CLUSTER_NAMELEN); - if (!n) return 1; /* We don't know the reported node. */ - if (n->configEpoch >= reportedConfigEpoch) return 1; /* Nothing new. */ - - /* If in our current config the node is a slave, set it as a master. */ - if (nodeIsSlave(n)) clusterSetNodeAsMaster(n); - - /* Update the node's configEpoch. */ - n->configEpoch = reportedConfigEpoch; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_FSYNC_CONFIG); - - /* Check the bitmap of served slots and update our - * config accordingly. */ - clusterUpdateSlotsConfigWith(n,reportedConfigEpoch, - hdr->data.update.nodecfg.slots); - } else if (type == CLUSTERMSG_TYPE_MODULE) { - if (!sender) return 1; /* Protect the module from unknown nodes. */ - /* We need to route this message back to the right module subscribed - * for the right message type. 
*/ - uint64_t module_id = hdr->data.module.msg.module_id; /* Endian-safe ID */ - uint32_t len = ntohl(hdr->data.module.msg.len); - uint8_t type = hdr->data.module.msg.type; - unsigned char *payload = hdr->data.module.msg.bulk_data; - moduleCallClusterReceivers(sender->name,module_id,type,payload,len); - } else { - serverLog(LL_WARNING,"Received unknown packet type: %d", type); - } - return 1; -} - -/* This function is called when we detect the link with this node is lost. - We set the node as no longer connected. The Cluster Cron will detect - this connection and will try to get it connected again. - - Instead if the node is a temporary node used to accept a query, we - completely free the node on error. */ -void handleLinkIOError(clusterLink *link) { - freeClusterLink(link); -} - -/* Send the messages queued for the link. */ -void clusterWriteHandler(connection *conn) { - clusterLink *link = connGetPrivateData(conn); - ssize_t nwritten; - size_t totwritten = 0; - - while (totwritten < NET_MAX_WRITES_PER_EVENT && listLength(link->send_msg_queue) > 0) { - listNode *head = listFirst(link->send_msg_queue); - clusterMsgSendBlock *msgblock = (clusterMsgSendBlock*)head->value; - clusterMsg *msg = &msgblock->msg; - size_t msg_offset = link->head_msg_send_offset; - size_t msg_len = ntohl(msg->totlen); - - nwritten = connWrite(conn, (char*)msg + msg_offset, msg_len - msg_offset); - if (nwritten <= 0) { - serverLog(LL_DEBUG,"I/O error writing to node link: %s", - (nwritten == -1) ? 
connGetLastError(conn) : "short write"); - handleLinkIOError(link); - return; - } - if (msg_offset + nwritten < msg_len) { - /* If full message wasn't written, record the offset - * and continue sending from this point next time */ - link->head_msg_send_offset += nwritten; - return; - } - serverAssert((msg_offset + nwritten) == msg_len); - link->head_msg_send_offset = 0; - - /* Delete the node and update our memory tracking */ - uint32_t blocklen = msgblock->totlen; - listDelNode(link->send_msg_queue, head); - server.stat_cluster_links_memory -= sizeof(listNode); - link->send_msg_queue_mem -= sizeof(listNode) + blocklen; - - totwritten += nwritten; - } - - if (listLength(link->send_msg_queue) == 0) - connSetWriteHandler(link->conn, NULL); -} - -/* A connect handler that gets called when a connection to another node - * gets established. - */ -void clusterLinkConnectHandler(connection *conn) { - clusterLink *link = connGetPrivateData(conn); - clusterNode *node = link->node; - - /* Check if connection succeeded */ - if (connGetState(conn) != CONN_STATE_CONNECTED) { - serverLog(LL_VERBOSE, "Connection with Node %.40s at %s:%d failed: %s", - node->name, node->ip, node->cport, - connGetLastError(conn)); - freeClusterLink(link); - return; - } - - /* Register a read handler from now on */ - connSetReadHandler(conn, clusterReadHandler); - - /* Queue a PING in the new connection ASAP: this is crucial - * to avoid false positives in failure detection. - * - * If the node is flagged as MEET, we send a MEET message instead - * of a PING one, to force the receiver to add us in its node - * table. */ - mstime_t old_ping_sent = node->ping_sent; - clusterSendPing(link, node->flags & CLUSTER_NODE_MEET ? - CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); - if (old_ping_sent) { - /* If there was an active ping before the link was - * disconnected, we want to restore the ping time, otherwise - * replaced by the clusterSendPing() call. 
*/ - node->ping_sent = old_ping_sent; - } - /* We can clear the flag after the first packet is sent. - * If we'll never receive a PONG, we'll never send new packets - * to this node. Instead after the PONG is received and we - * are no longer in meet/handshake status, we want to send - * normal PING packets. */ - node->flags &= ~CLUSTER_NODE_MEET; - - serverLog(LL_DEBUG,"Connecting with Node %.40s at %s:%d", - node->name, node->ip, node->cport); -} - -/* Read data. Try to read the first field of the header first to check the - * full length of the packet. When a whole packet is in memory this function - * will call the function to process the packet. And so forth. */ -void clusterReadHandler(connection *conn) { - clusterMsg buf[1]; - ssize_t nread; - clusterMsg *hdr; - clusterLink *link = connGetPrivateData(conn); - unsigned int readlen, rcvbuflen; - - while(1) { /* Read as long as there is data to read. */ - rcvbuflen = link->rcvbuf_len; - if (rcvbuflen < 8) { - /* First, obtain the first 8 bytes to get the full message - * length. */ - readlen = 8 - rcvbuflen; - } else { - /* Finally read the full message. */ - hdr = (clusterMsg*) link->rcvbuf; - if (rcvbuflen == 8) { - /* Perform some sanity check on the message signature - * and length. */ - if (memcmp(hdr->sig,"RCmb",4) != 0 || - ntohl(hdr->totlen) < CLUSTERMSG_MIN_LEN) - { - char ip[NET_IP_STR_LEN]; - int port; - if (connAddrPeerName(conn, ip, sizeof(ip), &port) == -1) { - serverLog(LL_WARNING, - "Bad message length or signature received " - "on the Cluster bus."); - } else { - serverLog(LL_WARNING, - "Bad message length or signature received " - "on the Cluster bus from %s:%d", ip, port); - } - handleLinkIOError(link); - return; - } - } - readlen = ntohl(hdr->totlen) - rcvbuflen; - if (readlen > sizeof(buf)) readlen = sizeof(buf); - } - - nread = connRead(conn,buf,readlen); - if (nread == -1 && (connGetState(conn) == CONN_STATE_CONNECTED)) return; /* No more data ready. 
*/ - - if (nread <= 0) { - /* I/O error... */ - serverLog(LL_DEBUG,"I/O error reading from node link: %s", - (nread == 0) ? "connection closed" : connGetLastError(conn)); - handleLinkIOError(link); - return; - } else { - /* Read data and recast the pointer to the new buffer. */ - size_t unused = link->rcvbuf_alloc - link->rcvbuf_len; - if ((size_t)nread > unused) { - size_t required = link->rcvbuf_len + nread; - size_t prev_rcvbuf_alloc = link->rcvbuf_alloc; - /* If less than 1mb, grow to twice the needed size, if larger grow by 1mb. */ - link->rcvbuf_alloc = required < RCVBUF_MAX_PREALLOC ? required * 2: required + RCVBUF_MAX_PREALLOC; - link->rcvbuf = zrealloc(link->rcvbuf, link->rcvbuf_alloc); - server.stat_cluster_links_memory += link->rcvbuf_alloc - prev_rcvbuf_alloc; - } - memcpy(link->rcvbuf + link->rcvbuf_len, buf, nread); - link->rcvbuf_len += nread; - hdr = (clusterMsg*) link->rcvbuf; - rcvbuflen += nread; - } - - /* Total length obtained? Process this packet. */ - if (rcvbuflen >= 8 && rcvbuflen == ntohl(hdr->totlen)) { - if (clusterProcessPacket(link)) { - if (link->rcvbuf_alloc > RCVBUF_INIT_LEN) { - size_t prev_rcvbuf_alloc = link->rcvbuf_alloc; - zfree(link->rcvbuf); - link->rcvbuf = zmalloc(link->rcvbuf_alloc = RCVBUF_INIT_LEN); - server.stat_cluster_links_memory += link->rcvbuf_alloc - prev_rcvbuf_alloc; - } - link->rcvbuf_len = 0; - } else { - return; /* Link no longer valid. */ - } - } - } -} - -/* Put the message block into the link's send queue. - * - * It is guaranteed that this function will never have as a side effect - * the link to be invalidated, so it is safe to call this function - * from event handlers that will do stuff with the same link later. 
*/ -void clusterSendMessage(clusterLink *link, clusterMsgSendBlock *msgblock) { - if (!link) { - return; - } - if (listLength(link->send_msg_queue) == 0 && msgblock->msg.totlen != 0) - connSetWriteHandlerWithBarrier(link->conn, clusterWriteHandler, 1); - - listAddNodeTail(link->send_msg_queue, msgblock); - msgblock->refcount++; - - /* Update memory tracking */ - link->send_msg_queue_mem += sizeof(listNode) + msgblock->totlen; - server.stat_cluster_links_memory += sizeof(listNode); - - /* Populate sent messages stats. */ - uint16_t type = ntohs(msgblock->msg.type); - if (type < CLUSTERMSG_TYPE_COUNT) - server.cluster->stats_bus_messages_sent[type]++; -} - -/* Send a message to all the nodes that are part of the cluster having - * a connected link. - * - * It is guaranteed that this function will never have as a side effect - * some node->link to be invalidated, so it is safe to call this function - * from event handlers that will do stuff with node links later. */ -void clusterBroadcastMessage(clusterMsgSendBlock *msgblock) { - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) - continue; - clusterSendMessage(node->link,msgblock); - } - dictReleaseIterator(di); -} - -/* Build the message header. hdr must point to a buffer at least - * sizeof(clusterMsg) in bytes. */ -static void clusterBuildMessageHdr(clusterMsg *hdr, int type, size_t msglen) { - uint64_t offset; - clusterNode *master; - - /* If this node is a master, we send its slots bitmap and configEpoch. - * If this node is a slave we send the master's information instead (the - * node is flagged as slave so the receiver knows that it is NOT really - * in charge for this slots. */ - master = (nodeIsSlave(myself) && myself->slaveof) ? 
- myself->slaveof : myself; - - hdr->ver = htons(CLUSTER_PROTO_VER); - hdr->sig[0] = 'R'; - hdr->sig[1] = 'C'; - hdr->sig[2] = 'm'; - hdr->sig[3] = 'b'; - hdr->type = htons(type); - memcpy(hdr->sender,myself->name,CLUSTER_NAMELEN); - - /* If cluster-announce-ip option is enabled, force the receivers of our - * packets to use the specified address for this node. Otherwise if the - * first byte is zero, they'll do auto discovery. */ - memset(hdr->myip,0,NET_IP_STR_LEN); - if (server.cluster_announce_ip) { - redis_strlcpy(hdr->myip,server.cluster_announce_ip,NET_IP_STR_LEN); - } - - /* Handle cluster-announce-[tls-|bus-]port. */ - int announced_tcp_port, announced_tls_port, announced_cport; - deriveAnnouncedPorts(&announced_tcp_port, &announced_tls_port, &announced_cport); - - memcpy(hdr->myslots,master->slots,sizeof(hdr->myslots)); - memset(hdr->slaveof,0,CLUSTER_NAMELEN); - if (myself->slaveof != NULL) - memcpy(hdr->slaveof,myself->slaveof->name, CLUSTER_NAMELEN); - if (server.tls_cluster) { - hdr->port = htons(announced_tls_port); - hdr->pport = htons(announced_tcp_port); - } else { - hdr->port = htons(announced_tcp_port); - hdr->pport = htons(announced_tls_port); - } - hdr->cport = htons(announced_cport); - hdr->flags = htons(myself->flags); - hdr->state = server.cluster->state; - - /* Set the currentEpoch and configEpochs. */ - hdr->currentEpoch = htonu64(server.cluster->currentEpoch); - hdr->configEpoch = htonu64(master->configEpoch); - - /* Set the replication offset. */ - if (nodeIsSlave(myself)) - offset = replicationGetSlaveOffset(); - else - offset = server.master_repl_offset; - hdr->offset = htonu64(offset); - - /* Set the message flags. */ - if (nodeIsMaster(myself) && server.cluster->mf_end) - hdr->mflags[0] |= CLUSTERMSG_FLAG0_PAUSED; - - hdr->totlen = htonl(msglen); -} - -/* Set the i-th entry of the gossip section in the message pointed by 'hdr' - * to the info of the specified node 'n'. 
*/ -void clusterSetGossipEntry(clusterMsg *hdr, int i, clusterNode *n) { - clusterMsgDataGossip *gossip; - gossip = &(hdr->data.ping.gossip[i]); - memcpy(gossip->nodename,n->name,CLUSTER_NAMELEN); - gossip->ping_sent = htonl(n->ping_sent/1000); - gossip->pong_received = htonl(n->pong_received/1000); - memcpy(gossip->ip,n->ip,sizeof(n->ip)); - if (server.tls_cluster) { - gossip->port = htons(n->tls_port); - gossip->pport = htons(n->tcp_port); - } else { - gossip->port = htons(n->tcp_port); - gossip->pport = htons(n->tls_port); - } - gossip->cport = htons(n->cport); - gossip->flags = htons(n->flags); - gossip->notused1 = 0; -} - -/* Send a PING or PONG packet to the specified node, making sure to add enough - * gossip information. */ -void clusterSendPing(clusterLink *link, int type) { - static unsigned long long cluster_pings_sent = 0; - cluster_pings_sent++; - int gossipcount = 0; /* Number of gossip sections added so far. */ - int wanted; /* Number of gossip sections we want to append if possible. */ - int estlen; /* Upper bound on estimated packet length */ - /* freshnodes is the max number of nodes we can hope to append at all: - * nodes available minus two (ourself and the node we are sending the - * message to). However practically there may be less valid nodes since - * nodes in handshake state, disconnected, are not considered. */ - int freshnodes = dictSize(server.cluster->nodes)-2; - - /* How many gossip sections we want to add? 1/10 of the number of nodes - * and anyway at least 3. Why 1/10? - * - * If we have N masters, with N/10 entries, and we consider that in - * node_timeout we exchange with each other node at least 4 packets - * (we ping in the worst case in node_timeout/2 time, and we also - * receive two pings from the host), we have a total of 8 packets - * in the node_timeout*2 failure reports validity time. 
So we have - * that, for a single PFAIL node, we can expect to receive the following - * number of failure reports (in the specified window of time): - * - * PROB * GOSSIP_ENTRIES_PER_PACKET * TOTAL_PACKETS: - * - * PROB = probability of being featured in a single gossip entry, - * which is 1 / NUM_OF_NODES. - * ENTRIES = 10. - * TOTAL_PACKETS = 2 * 4 * NUM_OF_MASTERS. - * - * If we assume we have just masters (so num of nodes and num of masters - * is the same), with 1/10 we always get over the majority, and specifically - * 80% of the number of nodes, to account for many masters failing at the - * same time. - * - * Since we have non-voting slaves that lower the probability of an entry - * to feature our node, we set the number of entries per packet as - * 10% of the total nodes we have. */ - wanted = floor(dictSize(server.cluster->nodes)/10); - if (wanted < 3) wanted = 3; - if (wanted > freshnodes) wanted = freshnodes; - - /* Include all the nodes in PFAIL state, so that failure reports are - * faster to propagate to go from PFAIL to FAIL state. */ - int pfail_wanted = server.cluster->stats_pfail_nodes; - - /* Compute the maximum estlen to allocate our buffer. We'll fix the estlen - * later according to the number of gossip sections we really were able - * to put inside the packet. */ - estlen = sizeof(clusterMsg) - sizeof(union clusterMsgData); - estlen += (sizeof(clusterMsgDataGossip)*(wanted + pfail_wanted)); - estlen += writePingExt(NULL, 0); - /* Note: clusterBuildMessageHdr() expects the buffer to be always at least - * sizeof(clusterMsg) or more. 
*/ - if (estlen < (int)sizeof(clusterMsg)) estlen = sizeof(clusterMsg); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, estlen); - clusterMsg *hdr = &msgblock->msg; - - if (!link->inbound && type == CLUSTERMSG_TYPE_PING) - link->node->ping_sent = mstime(); - - /* Populate the gossip fields */ - int maxiterations = wanted*3; - while(freshnodes > 0 && gossipcount < wanted && maxiterations--) { - dictEntry *de = dictGetRandomKey(server.cluster->nodes); - clusterNode *this = dictGetVal(de); - - /* Don't include this node: the whole packet header is about us - * already, so we just gossip about other nodes. */ - if (this == myself) continue; - - /* PFAIL nodes will be added later. */ - if (this->flags & CLUSTER_NODE_PFAIL) continue; - - /* In the gossip section don't include: - * 1) Nodes in HANDSHAKE state. - * 3) Nodes with the NOADDR flag set. - * 4) Disconnected nodes if they don't have configured slots. - */ - if (this->flags & (CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_NOADDR) || - (this->link == NULL && this->numslots == 0)) - { - freshnodes--; /* Technically not correct, but saves CPU. */ - continue; - } - - /* Do not add a node we already have. */ - if (this->last_in_ping_gossip == cluster_pings_sent) continue; - - /* Add it */ - clusterSetGossipEntry(hdr,gossipcount,this); - this->last_in_ping_gossip = cluster_pings_sent; - freshnodes--; - gossipcount++; - } - - /* If there are PFAIL nodes, add them at the end. 
*/ - if (pfail_wanted) { - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL && pfail_wanted > 0) { - clusterNode *node = dictGetVal(de); - if (node->flags & CLUSTER_NODE_HANDSHAKE) continue; - if (node->flags & CLUSTER_NODE_NOADDR) continue; - if (!(node->flags & CLUSTER_NODE_PFAIL)) continue; - clusterSetGossipEntry(hdr,gossipcount,node); - gossipcount++; - /* We take the count of the slots we allocated, since the - * PFAIL stats may not match perfectly with the current number - * of PFAIL nodes. */ - pfail_wanted--; - } - dictReleaseIterator(di); - } - - /* Compute the actual total length and send! */ - uint32_t totlen = 0; - totlen += writePingExt(hdr, gossipcount); - totlen += sizeof(clusterMsg)-sizeof(union clusterMsgData); - totlen += (sizeof(clusterMsgDataGossip)*gossipcount); - serverAssert(gossipcount < USHRT_MAX); - hdr->count = htons(gossipcount); - hdr->totlen = htonl(totlen); - - clusterSendMessage(link,msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Send a PONG packet to every connected node that's not in handshake state - * and for which we have a valid link. - * - * In Redis Cluster pongs are not used just for failure detection, but also - * to carry important configuration information. So broadcasting a pong is - * useful when something changes in the configuration and we want to make - * the cluster aware ASAP (for instance after a slave promotion). - * - * The 'target' argument specifies the receiving instances using the - * defines below: - * - * CLUSTER_BROADCAST_ALL -> All known instances. - * CLUSTER_BROADCAST_LOCAL_SLAVES -> All slaves in my master-slaves ring. 
- */ -#define CLUSTER_BROADCAST_ALL 0 -#define CLUSTER_BROADCAST_LOCAL_SLAVES 1 -void clusterBroadcastPong(int target) { - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (!node->link) continue; - if (node == myself || nodeInHandshake(node)) continue; - if (target == CLUSTER_BROADCAST_LOCAL_SLAVES) { - int local_slave = - nodeIsSlave(node) && node->slaveof && - (node->slaveof == myself || node->slaveof == myself->slaveof); - if (!local_slave) continue; - } - clusterSendPing(node->link,CLUSTERMSG_TYPE_PONG); - } - dictReleaseIterator(di); -} - -/* Create a PUBLISH message block. - * - * Sanitizer suppression: In clusterMsgDataPublish, sizeof(bulk_data) is 8. - * As all the struct is used as a buffer, when more than 8 bytes are copied into - * the 'bulk_data', sanitizer generates an out-of-bounds error which is a false - * positive in this context. */ -REDIS_NO_SANITIZE("bounds") -clusterMsgSendBlock *clusterCreatePublishMsgBlock(robj *channel, robj *message, uint16_t type) { - - uint32_t channel_len, message_len; - - channel = getDecodedObject(channel); - message = getDecodedObject(message); - channel_len = sdslen(channel->ptr); - message_len = sdslen(message->ptr); - - size_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - msglen += sizeof(clusterMsgDataPublish) - 8 + channel_len + message_len; - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, msglen); - - clusterMsg *hdr = &msgblock->msg; - hdr->data.publish.msg.channel_len = htonl(channel_len); - hdr->data.publish.msg.message_len = htonl(message_len); - memcpy(hdr->data.publish.msg.bulk_data,channel->ptr,sdslen(channel->ptr)); - memcpy(hdr->data.publish.msg.bulk_data+sdslen(channel->ptr), - message->ptr,sdslen(message->ptr)); - - decrRefCount(channel); - decrRefCount(message); - - return msgblock; -} - -/* Send a FAIL message to all the nodes we are able to 
contact. - * The FAIL message is sent when we detect that a node is failing - * (CLUSTER_NODE_PFAIL) and we also receive a gossip confirmation of this: - * we switch the node state to CLUSTER_NODE_FAIL and ask all the other - * nodes to do the same ASAP. */ -void clusterSendFail(char *nodename) { - uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) - + sizeof(clusterMsgDataFail); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAIL, msglen); - - clusterMsg *hdr = &msgblock->msg; - memcpy(hdr->data.fail.about.nodename,nodename,CLUSTER_NAMELEN); - - clusterBroadcastMessage(msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Send an UPDATE message to the specified link carrying the specified 'node' - * slots configuration. The node name, slots bitmap, and configEpoch info - * are included. */ -void clusterSendUpdate(clusterLink *link, clusterNode *node) { - if (link == NULL) return; - - uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) - + sizeof(clusterMsgDataUpdate); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_UPDATE, msglen); - - clusterMsg *hdr = &msgblock->msg; - memcpy(hdr->data.update.nodecfg.nodename,node->name,CLUSTER_NAMELEN); - hdr->data.update.nodecfg.configEpoch = htonu64(node->configEpoch); - memcpy(hdr->data.update.nodecfg.slots,node->slots,sizeof(node->slots)); - for (unsigned int i = 0; i < sizeof(node->slots); i++) { - /* Don't advertise slots that the node stopped claiming */ - hdr->data.update.nodecfg.slots[i] = hdr->data.update.nodecfg.slots[i] & (~server.cluster->owner_not_claiming_slot[i]); - } - - clusterSendMessage(link,msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Send a MODULE message. - * - * If link is NULL, then the message is broadcasted to the whole cluster. 
*/ -void clusterSendModule(clusterLink *link, uint64_t module_id, uint8_t type, - const char *payload, uint32_t len) { - uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - msglen += sizeof(clusterMsgModule) - 3 + len; - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MODULE, msglen); - - clusterMsg *hdr = &msgblock->msg; - hdr->data.module.msg.module_id = module_id; /* Already endian adjusted. */ - hdr->data.module.msg.type = type; - hdr->data.module.msg.len = htonl(len); - memcpy(hdr->data.module.msg.bulk_data,payload,len); - - if (link) - clusterSendMessage(link,msgblock); - else - clusterBroadcastMessage(msgblock); - - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* This function gets a cluster node ID string as target, the same way the nodes - * addresses are represented in the modules side, resolves the node, and sends - * the message. If the target is NULL the message is broadcasted. - * - * The function returns C_OK if the target is valid, otherwise C_ERR is - * returned. */ -int clusterSendModuleMessageToTarget(const char *target, uint64_t module_id, uint8_t type, const char *payload, uint32_t len) { - clusterNode *node = NULL; - - if (target != NULL) { - node = clusterLookupNode(target, strlen(target)); - if (node == NULL || node->link == NULL) return C_ERR; - } - - clusterSendModule(target ? node->link : NULL, - module_id, type, payload, len); - return C_OK; -} - -/* ----------------------------------------------------------------------------- - * CLUSTER Pub/Sub support - * - * If `sharded` is 0: - * For now we do very little, just propagating [S]PUBLISH messages across the whole - * cluster. In the future we'll try to get smarter and avoiding propagating those - * messages to hosts without receives for a given channel. - * Otherwise: - * Publish this message across the slot (primary/replica). 
- * -------------------------------------------------------------------------- */ -void clusterPropagatePublish(robj *channel, robj *message, int sharded) { - clusterMsgSendBlock *msgblock; - - if (!sharded) { - msgblock = clusterCreatePublishMsgBlock(channel, message, CLUSTERMSG_TYPE_PUBLISH); - clusterBroadcastMessage(msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); - return; - } - - listIter li; - listNode *ln; - list *nodes_for_slot = clusterGetNodesInMyShard(server.cluster->myself); - serverAssert(nodes_for_slot != NULL); - listRewind(nodes_for_slot, &li); - msgblock = clusterCreatePublishMsgBlock(channel, message, CLUSTERMSG_TYPE_PUBLISHSHARD); - while((ln = listNext(&li))) { - clusterNode *node = listNodeValue(ln); - if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) - continue; - clusterSendMessage(node->link,msgblock); - } - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* ----------------------------------------------------------------------------- - * SLAVE node specific functions - * -------------------------------------------------------------------------- */ - -/* This function sends a FAILOVER_AUTH_REQUEST message to every node in order to - * see if there is the quorum for this slave instance to failover its failing - * master. - * - * Note that we send the failover request to everybody, master and slave nodes, - * but only the masters are supposed to reply to our query. */ -void clusterRequestFailoverAuth(void) { - uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST, msglen); - - clusterMsg *hdr = &msgblock->msg; - /* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit - * in the header to communicate the nodes receiving the message that - * they should authorized the failover even if the master is working. 
*/ - if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; - clusterBroadcastMessage(msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Send a FAILOVER_AUTH_ACK message to the specified node. */ -void clusterSendFailoverAuth(clusterNode *node) { - if (!node->link) return; - - uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK, msglen); - - clusterSendMessage(node->link,msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Send a MFSTART message to the specified node. */ -void clusterSendMFStart(clusterNode *node) { - if (!node->link) return; - - uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MFSTART, msglen); - - clusterSendMessage(node->link,msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); -} - -/* Vote for the node asking for our vote if there are the conditions. */ -void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { - clusterNode *master = node->slaveof; - uint64_t requestCurrentEpoch = ntohu64(request->currentEpoch); - uint64_t requestConfigEpoch = ntohu64(request->configEpoch); - unsigned char *claimed_slots = request->myslots; - int force_ack = request->mflags[0] & CLUSTERMSG_FLAG0_FORCEACK; - int j; - - /* IF we are not a master serving at least 1 slot, we don't have the - * right to vote, as the cluster size in Redis Cluster is the number - * of masters serving at least one slot, and quorum is the cluster - * size + 1 */ - if (nodeIsSlave(myself) || myself->numslots == 0) return; - - /* Request epoch must be >= our currentEpoch. - * Note that it is impossible for it to actually be greater since - * our currentEpoch was updated as a side effect of receiving this - * request, if the request epoch was greater. 
*/ - if (requestCurrentEpoch < server.cluster->currentEpoch) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): reqEpoch (%llu) < curEpoch(%llu)", - node->name, node->human_nodename, - (unsigned long long) requestCurrentEpoch, - (unsigned long long) server.cluster->currentEpoch); - return; - } - - /* I already voted for this epoch? Return ASAP. */ - if (server.cluster->lastVoteEpoch == server.cluster->currentEpoch) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): already voted for epoch %llu", - node->name, node->human_nodename, - (unsigned long long) server.cluster->currentEpoch); - return; - } - - /* Node must be a slave and its master down. - * The master can be non failing if the request is flagged - * with CLUSTERMSG_FLAG0_FORCEACK (manual failover). */ - if (nodeIsMaster(node) || master == NULL || - (!nodeFailed(master) && !force_ack)) - { - if (nodeIsMaster(node)) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): it is a master node", - node->name, node->human_nodename); - } else if (master == NULL) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): I don't know its master", - node->name, node->human_nodename); - } else if (!nodeFailed(master)) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): its master is up", - node->name, node->human_nodename); - } - return; - } - - /* We did not voted for a slave about this master for two - * times the node timeout. This is not strictly needed for correctness - * of the algorithm but makes the base case more linear. 
*/ - if (mstime() - node->slaveof->voted_time < server.cluster_node_timeout * 2) - { - serverLog(LL_WARNING, - "Failover auth denied to %.40s %s: " - "can't vote about this master before %lld milliseconds", - node->name, node->human_nodename, - (long long) ((server.cluster_node_timeout*2)- - (mstime() - node->slaveof->voted_time))); - return; - } - - /* The slave requesting the vote must have a configEpoch for the claimed - * slots that is >= the one of the masters currently serving the same - * slots in the current configuration. */ - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (bitmapTestBit(claimed_slots, j) == 0) continue; - if (isSlotUnclaimed(j) || - server.cluster->slots[j]->configEpoch <= requestConfigEpoch) - { - continue; - } - /* If we reached this point we found a slot that in our current slots - * is served by a master with a greater configEpoch than the one claimed - * by the slave requesting our vote. Refuse to vote for this slave. */ - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): " - "slot %d epoch (%llu) > reqEpoch (%llu)", - node->name, node->human_nodename, j, - (unsigned long long) server.cluster->slots[j]->configEpoch, - (unsigned long long) requestConfigEpoch); - return; - } - - /* We can vote for this slave. */ - server.cluster->lastVoteEpoch = server.cluster->currentEpoch; - node->slaveof->voted_time = mstime(); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_FSYNC_CONFIG); - clusterSendFailoverAuth(node); - serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", - node->name, node->human_nodename, (unsigned long long) server.cluster->currentEpoch); -} - -/* This function returns the "rank" of this instance, a slave, in the context - * of its master-slaves ring. The rank of the slave is given by the number of - * other slaves for the same master that have a better replication offset - * compared to the local one (better means, greater, so they claim more data). 
- * - * A slave with rank 0 is the one with the greatest (most up to date) - * replication offset, and so forth. Note that because how the rank is computed - * multiple slaves may have the same rank, in case they have the same offset. - * - * The slave rank is used to add a delay to start an election in order to - * get voted and replace a failing master. Slaves with better replication - * offsets are more likely to win. */ -int clusterGetSlaveRank(void) { - long long myoffset; - int j, rank = 0; - clusterNode *master; - - serverAssert(nodeIsSlave(myself)); - master = myself->slaveof; - if (master == NULL) return 0; /* Never called by slaves without master. */ - - myoffset = replicationGetSlaveOffset(); - for (j = 0; j < master->numslaves; j++) - if (master->slaves[j] != myself && - !nodeCantFailover(master->slaves[j]) && - master->slaves[j]->repl_offset > myoffset) rank++; - return rank; -} - -/* This function is called by clusterHandleSlaveFailover() in order to - * let the slave log why it is not able to failover. Sometimes there are - * not the conditions, but since the failover function is called again and - * again, we can't log the same things continuously. - * - * This function works by logging only if a given set of conditions are - * true: - * - * 1) The reason for which the failover can't be initiated changed. - * The reasons also include a NONE reason we reset the state to - * when the slave finds that its master is fine (no FAIL flag). - * 2) Also, the log is emitted again if the master is still down and - * the reason for not failing over is still the same, but more than - * CLUSTER_CANT_FAILOVER_RELOG_PERIOD seconds elapsed. - * 3) Finally, the function only logs if the slave is down for more than - * five seconds + NODE_TIMEOUT. This way nothing is logged when a - * failover starts in a reasonable time. - * - * The function is called with the reason why the slave can't failover - * which is one of the integer macros CLUSTER_CANT_FAILOVER_*. 
- * - * The function is guaranteed to be called only if 'myself' is a slave. */ -void clusterLogCantFailover(int reason) { - char *msg; - static time_t lastlog_time = 0; - mstime_t nolog_fail_time = server.cluster_node_timeout + 5000; - - /* Don't log if we have the same reason for some time. */ - if (reason == server.cluster->cant_failover_reason && - time(NULL)-lastlog_time < CLUSTER_CANT_FAILOVER_RELOG_PERIOD) - return; - - server.cluster->cant_failover_reason = reason; - - /* We also don't emit any log if the master failed no long ago, the - * goal of this function is to log slaves in a stalled condition for - * a long time. */ - if (myself->slaveof && - nodeFailed(myself->slaveof) && - (mstime() - myself->slaveof->fail_time) < nolog_fail_time) return; - - switch(reason) { - case CLUSTER_CANT_FAILOVER_DATA_AGE: - msg = "Disconnected from master for longer than allowed. " - "Please check the 'cluster-replica-validity-factor' configuration " - "option."; - break; - case CLUSTER_CANT_FAILOVER_WAITING_DELAY: - msg = "Waiting the delay before I can start a new failover."; - break; - case CLUSTER_CANT_FAILOVER_EXPIRED: - msg = "Failover attempt expired."; - break; - case CLUSTER_CANT_FAILOVER_WAITING_VOTES: - msg = "Waiting for votes, but majority still not reached."; - break; - default: - msg = "Unknown reason code."; - break; - } - lastlog_time = time(NULL); - serverLog(LL_NOTICE,"Currently unable to failover: %s", msg); - - int cur_vote = server.cluster->failover_auth_count; - int cur_quorum = (server.cluster->size / 2) + 1; - /* Emits a log when an election is in progress and waiting for votes or when the failover attempt expired. */ - if (reason == CLUSTER_CANT_FAILOVER_WAITING_VOTES || reason == CLUSTER_CANT_FAILOVER_EXPIRED) { - serverLog(LL_NOTICE, "Needed quorum: %d. 
Number of votes received so far: %d", cur_quorum, cur_vote); - } -} - -/* This function implements the final part of automatic and manual failovers, - * where the slave grabs its master's hash slots, and propagates the new - * configuration. - * - * Note that it's up to the caller to be sure that the node got a new - * configuration epoch already. */ -void clusterFailoverReplaceYourMaster(void) { - int j; - clusterNode *oldmaster = myself->slaveof; - - if (nodeIsMaster(myself) || oldmaster == NULL) return; - - /* 1) Turn this node into a master. */ - clusterSetNodeAsMaster(myself); - replicationUnsetMaster(); - - /* 2) Claim all the slots assigned to our master. */ - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (clusterNodeGetSlotBit(oldmaster,j)) { - clusterDelSlot(j); - clusterAddSlot(myself,j); - } - } - - /* 3) Update state and save config. */ - clusterUpdateState(); - clusterSaveConfigOrDie(1); - - /* 4) Pong all the other nodes so that they can update the state - * accordingly and detect that we switched to master role. */ - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); - - /* 5) If there was a manual failover in progress, clear the state. */ - resetManualFailover(); -} - -/* This function is called if we are a slave node and our master serving - * a non-zero amount of hash slots is in FAIL state. - * - * The goal of this function is: - * 1) To check if we are able to perform a failover, is our data updated? - * 2) Try to get elected by masters. - * 3) Perform the failover informing all the other nodes. 
- */ -void clusterHandleSlaveFailover(void) { - mstime_t data_age; - mstime_t auth_age = mstime() - server.cluster->failover_auth_time; - int needed_quorum = (server.cluster->size / 2) + 1; - int manual_failover = server.cluster->mf_end != 0 && - server.cluster->mf_can_start; - mstime_t auth_timeout, auth_retry_time; - - server.cluster->todo_before_sleep &= ~CLUSTER_TODO_HANDLE_FAILOVER; - - /* Compute the failover timeout (the max time we have to send votes - * and wait for replies), and the failover retry time (the time to wait - * before trying to get voted again). - * - * Timeout is MAX(NODE_TIMEOUT*2,2000) milliseconds. - * Retry is two times the Timeout. - */ - auth_timeout = server.cluster_node_timeout*2; - if (auth_timeout < 2000) auth_timeout = 2000; - auth_retry_time = auth_timeout*2; - - /* Pre conditions to run the function, that must be met both in case - * of an automatic or manual failover: - * 1) We are a slave. - * 2) Our master is flagged as FAIL, or this is a manual failover. - * 3) We don't have the no failover configuration set, and this is - * not a manual failover. - * 4) It is serving slots. */ - if (nodeIsMaster(myself) || - myself->slaveof == NULL || - (!nodeFailed(myself->slaveof) && !manual_failover) || - (server.cluster_slave_no_failover && !manual_failover) || - myself->slaveof->numslots == 0) - { - /* There are no reasons to failover, so we set the reason why we - * are returning without failing over to NONE. */ - server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; - return; - } - - /* Set data_age to the number of milliseconds we are disconnected from - * the master. 
*/ - if (server.repl_state == REPL_STATE_CONNECTED) { - data_age = (mstime_t)(server.unixtime - server.master->lastinteraction) - * 1000; - } else { - data_age = (mstime_t)(server.unixtime - server.repl_down_since) * 1000; - } - - /* Remove the node timeout from the data age as it is fine that we are - * disconnected from our master at least for the time it was down to be - * flagged as FAIL, that's the baseline. */ - if (data_age > server.cluster_node_timeout) - data_age -= server.cluster_node_timeout; - - /* Check if our data is recent enough according to the slave validity - * factor configured by the user. - * - * Check bypassed for manual failovers. */ - if (server.cluster_slave_validity_factor && - data_age > - (((mstime_t)server.repl_ping_slave_period * 1000) + - (server.cluster_node_timeout * server.cluster_slave_validity_factor))) - { - if (!manual_failover) { - clusterLogCantFailover(CLUSTER_CANT_FAILOVER_DATA_AGE); - return; - } - } - - /* If the previous failover attempt timeout and the retry time has - * elapsed, we can setup a new one. */ - if (auth_age > auth_retry_time) { - server.cluster->failover_auth_time = mstime() + - 500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */ - random() % 500; /* Random delay between 0 and 500 milliseconds. */ - server.cluster->failover_auth_count = 0; - server.cluster->failover_auth_sent = 0; - server.cluster->failover_auth_rank = clusterGetSlaveRank(); - /* We add another delay that is proportional to the slave rank. - * Specifically 1 second * rank. This way slaves that have a probably - * less updated replication offset, are penalized. */ - server.cluster->failover_auth_time += - server.cluster->failover_auth_rank * 1000; - /* However if this is a manual failover, no delay is needed. 
*/ - if (server.cluster->mf_end) { - server.cluster->failover_auth_time = mstime(); - server.cluster->failover_auth_rank = 0; - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); - } - serverLog(LL_NOTICE, - "Start of election delayed for %lld milliseconds " - "(rank #%d, offset %lld).", - server.cluster->failover_auth_time - mstime(), - server.cluster->failover_auth_rank, - replicationGetSlaveOffset()); - /* Now that we have a scheduled election, broadcast our offset - * to all the other slaves so that they'll updated their offsets - * if our offset is better. */ - clusterBroadcastPong(CLUSTER_BROADCAST_LOCAL_SLAVES); - return; - } - - /* It is possible that we received more updated offsets from other - * slaves for the same master since we computed our election delay. - * Update the delay if our rank changed. - * - * Not performed if this is a manual failover. */ - if (server.cluster->failover_auth_sent == 0 && - server.cluster->mf_end == 0) - { - int newrank = clusterGetSlaveRank(); - if (newrank > server.cluster->failover_auth_rank) { - long long added_delay = - (newrank - server.cluster->failover_auth_rank) * 1000; - server.cluster->failover_auth_time += added_delay; - server.cluster->failover_auth_rank = newrank; - serverLog(LL_NOTICE, - "Replica rank updated to #%d, added %lld milliseconds of delay.", - newrank, added_delay); - } - } - - /* Return ASAP if we can't still start the election. */ - if (mstime() < server.cluster->failover_auth_time) { - clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_DELAY); - return; - } - - /* Return ASAP if the election is too old to be valid. */ - if (auth_age > auth_timeout) { - clusterLogCantFailover(CLUSTER_CANT_FAILOVER_EXPIRED); - return; - } - - /* Ask for votes if needed. 
*/ - if (server.cluster->failover_auth_sent == 0) { - server.cluster->currentEpoch++; - server.cluster->failover_auth_epoch = server.cluster->currentEpoch; - serverLog(LL_NOTICE,"Starting a failover election for epoch %llu.", - (unsigned long long) server.cluster->currentEpoch); - clusterRequestFailoverAuth(); - server.cluster->failover_auth_sent = 1; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| - CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_FSYNC_CONFIG); - return; /* Wait for replies. */ - } - - /* Check if we reached the quorum. */ - if (server.cluster->failover_auth_count >= needed_quorum) { - /* We have the quorum, we can finally failover the master. */ - - serverLog(LL_NOTICE, - "Failover election won: I'm the new master."); - - /* Update my configEpoch to the epoch of the election. */ - if (myself->configEpoch < server.cluster->failover_auth_epoch) { - myself->configEpoch = server.cluster->failover_auth_epoch; - serverLog(LL_NOTICE, - "configEpoch set to %llu after successful failover", - (unsigned long long) myself->configEpoch); - } - - /* Take responsibility for the cluster slots. */ - clusterFailoverReplaceYourMaster(); - } else { - clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_VOTES); - } -} - -/* ----------------------------------------------------------------------------- - * CLUSTER slave migration - * - * Slave migration is the process that allows a slave of a master that is - * already covered by at least another slave, to "migrate" to a master that - * is orphaned, that is, left with no working slaves. - * ------------------------------------------------------------------------- */ - -/* This function is responsible to decide if this replica should be migrated - * to a different (orphaned) master. It is called by the clusterCron() function - * only if: - * - * 1) We are a slave node. - * 2) It was detected that there is at least one orphaned master in - * the cluster. 
- * 3) We are a slave of one of the masters with the greatest number of - * slaves. - * - * This checks are performed by the caller since it requires to iterate - * the nodes anyway, so we spend time into clusterHandleSlaveMigration() - * if definitely needed. - * - * The function is called with a pre-computed max_slaves, that is the max - * number of working (not in FAIL state) slaves for a single master. - * - * Additional conditions for migration are examined inside the function. - */ -void clusterHandleSlaveMigration(int max_slaves) { - int j, okslaves = 0; - clusterNode *mymaster = myself->slaveof, *target = NULL, *candidate = NULL; - dictIterator *di; - dictEntry *de; - - /* Step 1: Don't migrate if the cluster state is not ok. */ - if (server.cluster->state != CLUSTER_OK) return; - - /* Step 2: Don't migrate if my master will not be left with at least - * 'migration-barrier' slaves after my migration. */ - if (mymaster == NULL) return; - for (j = 0; j < mymaster->numslaves; j++) - if (!nodeFailed(mymaster->slaves[j]) && - !nodeTimedOut(mymaster->slaves[j])) okslaves++; - if (okslaves <= server.cluster_migration_barrier) return; - - /* Step 3: Identify a candidate for migration, and check if among the - * masters with the greatest number of ok slaves, I'm the one with the - * smallest node ID (the "candidate slave"). - * - * Note: this means that eventually a replica migration will occur - * since slaves that are reachable again always have their FAIL flag - * cleared, so eventually there must be a candidate. - * There is a possible race condition causing multiple - * slaves to migrate at the same time, but this is unlikely to - * happen and relatively harmless when it does. 
*/ - candidate = myself; - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - int okslaves = 0, is_orphaned = 1; - - /* We want to migrate only if this master is working, orphaned, and - * used to have slaves or if failed over a master that had slaves - * (MIGRATE_TO flag). This way we only migrate to instances that were - * supposed to have replicas. */ - if (nodeIsSlave(node) || nodeFailed(node)) is_orphaned = 0; - if (!(node->flags & CLUSTER_NODE_MIGRATE_TO)) is_orphaned = 0; - - /* Check number of working slaves. */ - if (nodeIsMaster(node)) okslaves = clusterCountNonFailingSlaves(node); - if (okslaves > 0) is_orphaned = 0; - - if (is_orphaned) { - if (!target && node->numslots > 0) target = node; - - /* Track the starting time of the orphaned condition for this - * master. */ - if (!node->orphaned_time) node->orphaned_time = mstime(); - } else { - node->orphaned_time = 0; - } - - /* Check if I'm the slave candidate for the migration: attached - * to a master with the maximum number of slaves and with the smallest - * node ID. */ - if (okslaves == max_slaves) { - for (j = 0; j < node->numslaves; j++) { - if (memcmp(node->slaves[j]->name, - candidate->name, - CLUSTER_NAMELEN) < 0) - { - candidate = node->slaves[j]; - } - } - } - } - dictReleaseIterator(di); - - /* Step 4: perform the migration if there is a target, and if I'm the - * candidate, but only if the master is continuously orphaned for a - * couple of seconds, so that during failovers, we give some time to - * the natural slaves of this instance to advertise their switch from - * the old master to the new one. 
*/ - if (target && candidate == myself && - (mstime()-target->orphaned_time) > CLUSTER_SLAVE_MIGRATION_DELAY && - !(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) - { - serverLog(LL_NOTICE,"Migrating to orphaned master %.40s", - target->name); - clusterSetMaster(target); - } -} - -/* ----------------------------------------------------------------------------- - * CLUSTER manual failover - * - * This are the important steps performed by slaves during a manual failover: - * 1) User send CLUSTER FAILOVER command. The failover state is initialized - * setting mf_end to the millisecond unix time at which we'll abort the - * attempt. - * 2) Slave sends a MFSTART message to the master requesting to pause clients - * for two times the manual failover timeout CLUSTER_MF_TIMEOUT. - * When master is paused for manual failover, it also starts to flag - * packets with CLUSTERMSG_FLAG0_PAUSED. - * 3) Slave waits for master to send its replication offset flagged as PAUSED. - * 4) If slave received the offset from the master, and its offset matches, - * mf_can_start is set to 1, and clusterHandleSlaveFailover() will perform - * the failover as usually, with the difference that the vote request - * will be modified to force masters to vote for a slave that has a - * working master. - * - * From the point of view of the master things are simpler: when a - * PAUSE_CLIENTS packet is received the master sets mf_end as well and - * the sender in mf_slave. During the time limit for the manual failover - * the master will just send PINGs more often to this slave, flagged with - * the PAUSED flag, so that the slave will set mf_master_offset when receiving - * a packet from the master with this flag set. - * - * The goal of the manual failover is to perform a fast failover without - * data loss due to the asynchronous master-slave replication. - * -------------------------------------------------------------------------- */ - -/* Reset the manual failover state. 
This works for both masters and slaves - * as all the state about manual failover is cleared. - * - * The function can be used both to initialize the manual failover state at - * startup or to abort a manual failover in progress. */ -void resetManualFailover(void) { - if (server.cluster->mf_slave) { - /* We were a master failing over, so we paused clients and related actions. - * Regardless of the outcome we unpause now to allow traffic again. */ - unpauseActions(PAUSE_DURING_FAILOVER); - } - server.cluster->mf_end = 0; /* No manual failover in progress. */ - server.cluster->mf_can_start = 0; - server.cluster->mf_slave = NULL; - server.cluster->mf_master_offset = -1; -} - -/* If a manual failover timed out, abort it. */ -void manualFailoverCheckTimeout(void) { - if (server.cluster->mf_end && server.cluster->mf_end < mstime()) { - serverLog(LL_WARNING,"Manual failover timed out."); - resetManualFailover(); - } -} - -/* This function is called from the cluster cron function in order to go - * forward with a manual failover state machine. */ -void clusterHandleManualFailover(void) { - /* Return ASAP if no manual failover is in progress. */ - if (server.cluster->mf_end == 0) return; - - /* If mf_can_start is non-zero, the failover was already triggered so the - * next steps are performed by clusterHandleSlaveFailover(). */ - if (server.cluster->mf_can_start) return; - - if (server.cluster->mf_master_offset == -1) return; /* Wait for offset... */ - - if (server.cluster->mf_master_offset == replicationGetSlaveOffset()) { - /* Our replication offset matches the master replication offset - * announced after clients were paused. We can start the failover. 
*/ - server.cluster->mf_can_start = 1; - serverLog(LL_NOTICE, - "All master replication stream processed, " - "manual failover can start."); - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); - return; - } - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); -} - -/* ----------------------------------------------------------------------------- - * CLUSTER cron job - * -------------------------------------------------------------------------- */ - -/* Check if the node is disconnected and re-establish the connection. - * Also update a few stats while we are here, that can be used to make - * better decisions in other part of the code. */ -static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_timeout, mstime_t now) { - /* Not interested in reconnecting the link with myself or nodes - * for which we have no address. */ - if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR)) return 1; - - if (node->flags & CLUSTER_NODE_PFAIL) - server.cluster->stats_pfail_nodes++; - - /* A Node in HANDSHAKE state has a limited lifespan equal to the - * configured node timeout. */ - if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) { - clusterDelNode(node); - return 1; - } - - if (node->link == NULL) { - clusterLink *link = createClusterLink(node); - link->conn = connCreate(connTypeOfCluster()); - connSetPrivateData(link->conn, link); - if (connConnect(link->conn, node->ip, node->cport, server.bind_source_addr, - clusterLinkConnectHandler) == C_ERR) { - /* We got a synchronous error from connect before - * clusterSendPing() had a chance to be called. - * If node->ping_sent is zero, failure detection can't work, - * so we claim we actually sent a ping now (that will - * be really sent as soon as the link is obtained). 
*/ - if (node->ping_sent == 0) node->ping_sent = mstime(); - serverLog(LL_DEBUG, "Unable to connect to " - "Cluster Node [%s]:%d -> %s", node->ip, - node->cport, server.neterr); - - freeClusterLink(link); - return 0; - } - } - return 0; -} - -static void freeClusterLinkOnBufferLimitReached(clusterLink *link) { - if (link == NULL || server.cluster_link_msg_queue_limit_bytes == 0) { - return; - } - - unsigned long long mem_link = link->send_msg_queue_mem; - if (mem_link > server.cluster_link_msg_queue_limit_bytes) { - serverLog(LL_WARNING, "Freeing cluster link(%s node %.40s, used memory: %llu) due to " - "exceeding send buffer memory limit.", link->inbound ? "from" : "to", - link->node ? link->node->name : "", mem_link); - freeClusterLink(link); - server.cluster->stat_cluster_links_buffer_limit_exceeded++; - } -} - -/* Free outbound link to a node if its send buffer size exceeded limit. */ -static void clusterNodeCronFreeLinkOnBufferLimitReached(clusterNode *node) { - freeClusterLinkOnBufferLimitReached(node->link); - freeClusterLinkOnBufferLimitReached(node->inbound_link); -} - -/* This is executed 10 times every second */ -void clusterCron(void) { - dictIterator *di; - dictEntry *de; - int update_state = 0; - int orphaned_masters; /* How many masters there are without ok slaves. */ - int max_slaves; /* Max number of ok slaves for a single master. */ - int this_slaves; /* Number of ok slaves for our master (if we are slave). */ - mstime_t min_pong = 0, now = mstime(); - clusterNode *min_pong_node = NULL; - static unsigned long long iteration = 0; - mstime_t handshake_timeout; - - iteration++; /* Number of times this function was called so far. */ - - clusterUpdateMyselfHostname(); - - /* The handshake timeout is the time after which a handshake node that was - * not turned into a normal node is removed from the nodes. Usually it is - * just the NODE_TIMEOUT value, but when NODE_TIMEOUT is too small we use - * the value of 1 second. 
*/ - handshake_timeout = server.cluster_node_timeout; - if (handshake_timeout < 1000) handshake_timeout = 1000; - - /* Clear so clusterNodeCronHandleReconnect can count the number of nodes in PFAIL. */ - server.cluster->stats_pfail_nodes = 0; - /* Run through some of the operations we want to do on each cluster node. */ - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - /* We free the inbound or outboud link to the node if the link has an - * oversized message send queue and immediately try reconnecting. */ - clusterNodeCronFreeLinkOnBufferLimitReached(node); - /* The protocol is that function(s) below return non-zero if the node was - * terminated. - */ - if(clusterNodeCronHandleReconnect(node, handshake_timeout, now)) continue; - } - dictReleaseIterator(di); - - /* Ping some random node 1 time every 10 iterations, so that we usually ping - * one random node every second. */ - if (!(iteration % 10)) { - int j; - - /* Check a few random nodes and ping the one with the oldest - * pong_received time. */ - for (j = 0; j < 5; j++) { - de = dictGetRandomKey(server.cluster->nodes); - clusterNode *this = dictGetVal(de); - - /* Don't ping nodes disconnected or with a ping currently active. */ - if (this->link == NULL || this->ping_sent != 0) continue; - if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) - continue; - if (min_pong_node == NULL || min_pong > this->pong_received) { - min_pong_node = this; - min_pong = this->pong_received; - } - } - if (min_pong_node) { - serverLog(LL_DEBUG,"Pinging node %.40s", min_pong_node->name); - clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING); - } - } - - /* Iterate nodes to check if we need to flag something as failing. - * This loop is also responsible to: - * 1) Check if there are orphaned masters (masters without non failing - * slaves). - * 2) Count the max number of non failing slaves for a single master. 
- * 3) Count the number of slaves for our master, if we are a slave. */ - orphaned_masters = 0; - max_slaves = 0; - this_slaves = 0; - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - now = mstime(); /* Use an updated time at every iteration. */ - - if (node->flags & - (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) - continue; - - /* Orphaned master check, useful only if the current instance - * is a slave that may migrate to another master. */ - if (nodeIsSlave(myself) && nodeIsMaster(node) && !nodeFailed(node)) { - int okslaves = clusterCountNonFailingSlaves(node); - - /* A master is orphaned if it is serving a non-zero number of - * slots, have no working slaves, but used to have at least one - * slave, or failed over a master that used to have slaves. */ - if (okslaves == 0 && node->numslots > 0 && - node->flags & CLUSTER_NODE_MIGRATE_TO) - { - orphaned_masters++; - } - if (okslaves > max_slaves) max_slaves = okslaves; - if (myself->slaveof == node) - this_slaves = okslaves; - } - - /* If we are not receiving any data for more than half the cluster - * timeout, reconnect the link: maybe there is a connection - * issue even if the node is alive. */ - mstime_t ping_delay = now - node->ping_sent; - mstime_t data_delay = now - node->data_received; - if (node->link && /* is connected */ - now - node->link->ctime > - server.cluster_node_timeout && /* was not already reconnected */ - node->ping_sent && /* we already sent a ping */ - /* and we are waiting for the pong more than timeout/2 */ - ping_delay > server.cluster_node_timeout/2 && - /* and in such interval we are not seeing any traffic at all. */ - data_delay > server.cluster_node_timeout/2) - { - /* Disconnect the link, it will be reconnected automatically. 
*/ - freeClusterLink(node->link); - } - - /* If we have currently no active ping in this instance, and the - * received PONG is older than half the cluster timeout, send - * a new ping now, to ensure all the nodes are pinged without - * a too big delay. */ - mstime_t ping_interval = server.cluster_ping_interval ? - server.cluster_ping_interval : server.cluster_node_timeout/2; - if (node->link && - node->ping_sent == 0 && - (now - node->pong_received) > ping_interval) - { - clusterSendPing(node->link, CLUSTERMSG_TYPE_PING); - continue; - } - - /* If we are a master and one of the slaves requested a manual - * failover, ping it continuously. */ - if (server.cluster->mf_end && - nodeIsMaster(myself) && - server.cluster->mf_slave == node && - node->link) - { - clusterSendPing(node->link, CLUSTERMSG_TYPE_PING); - continue; - } - - /* Check only if we have an active ping for this instance. */ - if (node->ping_sent == 0) continue; - - /* Check if this node looks unreachable. - * Note that if we already received the PONG, then node->ping_sent - * is zero, so can't reach this code at all, so we don't risk of - * checking for a PONG delay if we didn't sent the PING. - * - * We also consider every incoming data as proof of liveness, since - * our cluster bus link is also used for data: under heavy data - * load pong delays are possible. */ - mstime_t node_delay = (ping_delay < data_delay) ? ping_delay : - data_delay; - - if (node_delay > server.cluster_node_timeout) { - /* Timeout reached. Set the node as possibly failing if it is - * not already in this state. */ - if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) { - serverLog(LL_DEBUG,"*** NODE %.40s possibly failing", - node->name); - node->flags |= CLUSTER_NODE_PFAIL; - update_state = 1; - } - } - } - dictReleaseIterator(di); - - /* If we are a slave node but the replication is still turned off, - * enable it if we know the address of our master and it appears to - * be up. 
*/ - if (nodeIsSlave(myself) && - server.masterhost == NULL && - myself->slaveof && - nodeHasAddr(myself->slaveof)) - { - replicationSetMaster(myself->slaveof->ip, getNodeDefaultReplicationPort(myself->slaveof)); - } - - /* Abort a manual failover if the timeout is reached. */ - manualFailoverCheckTimeout(); - - if (nodeIsSlave(myself)) { - clusterHandleManualFailover(); - if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) - clusterHandleSlaveFailover(); - /* If there are orphaned slaves, and we are a slave among the masters - * with the max number of non-failing slaves, consider migrating to - * the orphaned masters. Note that it does not make sense to try - * a migration if there is no master with at least *two* working - * slaves. */ - if (orphaned_masters && max_slaves >= 2 && this_slaves == max_slaves && - server.cluster_allow_replica_migration) - clusterHandleSlaveMigration(max_slaves); - } - - if (update_state || server.cluster->state == CLUSTER_FAIL) - clusterUpdateState(); -} - -/* This function is called before the event handler returns to sleep for - * events. It is useful to perform operations that must be done ASAP in - * reaction to events fired but that are not safe to perform inside event - * handlers, or to perform potentially expansive tasks that we need to do - * a single time before replying to clients. */ -void clusterBeforeSleep(void) { - int flags = server.cluster->todo_before_sleep; - - /* Reset our flags (not strictly needed since every single function - * called for flags set should be able to clear its flag). 
*/ - server.cluster->todo_before_sleep = 0; - - if (flags & CLUSTER_TODO_HANDLE_MANUALFAILOVER) { - /* Handle manual failover as soon as possible so that won't have a 100ms - * as it was handled only in clusterCron */ - if(nodeIsSlave(myself)) { - clusterHandleManualFailover(); - if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) - clusterHandleSlaveFailover(); - } - } else if (flags & CLUSTER_TODO_HANDLE_FAILOVER) { - /* Handle failover, this is needed when it is likely that there is already - * the quorum from masters in order to react fast. */ - clusterHandleSlaveFailover(); - } - - /* Update the cluster state. */ - if (flags & CLUSTER_TODO_UPDATE_STATE) - clusterUpdateState(); - - /* Save the config, possibly using fsync. */ - if (flags & CLUSTER_TODO_SAVE_CONFIG) { - int fsync = flags & CLUSTER_TODO_FSYNC_CONFIG; - clusterSaveConfigOrDie(fsync); - } -} - -void clusterDoBeforeSleep(int flags) { - server.cluster->todo_before_sleep |= flags; -} - -/* ----------------------------------------------------------------------------- - * Slots management - * -------------------------------------------------------------------------- */ - -/* Test bit 'pos' in a generic bitmap. Return 1 if the bit is set, - * otherwise 0. */ -int bitmapTestBit(unsigned char *bitmap, int pos) { - off_t byte = pos/8; - int bit = pos&7; - return (bitmap[byte] & (1<nodes); - dictEntry *de; - int slaves = 0; - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - - if (nodeIsSlave(node)) continue; - slaves += node->numslaves; - } - dictReleaseIterator(di); - return slaves != 0; -} - -/* Set the slot bit and return the old value. 
/* Set the slot bit and return the old value. */
int clusterNodeSetSlotBit(clusterNode *n, int slot) {
    int old = bitmapTestBit(n->slots,slot);
    if (!old) {
        bitmapSetBit(n->slots,slot);
        n->numslots++;
        /* When a master gets its first slot, even if it has no slaves,
         * it gets flagged with MIGRATE_TO, that is, the master is a valid
         * target for replicas migration, if and only if at least one of
         * the other masters has slaves right now.
         *
         * Normally masters are valid targets of replica migration if:
         * 1. The used to have slaves (but no longer have).
         * 2. They are slaves failing over a master that used to have slaves.
         *
         * However new masters with slots assigned are considered valid
         * migration targets if the rest of the cluster is not a slave-less.
         *
         * See https://github.com/redis/redis/issues/3043 for more info. */
        if (n->numslots == 1 && clusterMastersHaveSlaves())
            n->flags |= CLUSTER_NODE_MIGRATE_TO;
    }
    return old;
}

/* Clear the slot bit and return the old value. */
int clusterNodeClearSlotBit(clusterNode *n, int slot) {
    int old = bitmapTestBit(n->slots,slot);
    if (old) {
        bitmapClearBit(n->slots,slot);
        n->numslots--;
    }
    return old;
}

/* Return the slot bit from the cluster node structure. */
int clusterNodeGetSlotBit(clusterNode *n, int slot) {
    return bitmapTestBit(n->slots,slot);
}

/* Add the specified slot to the list of slots that node 'n' will
 * serve. Return C_OK if the operation ended with success.
 * If the slot is already assigned to another instance this is considered
 * an error and C_ERR is returned. */
int clusterAddSlot(clusterNode *n, int slot) {
    /* Slot already owned by some node: refuse to reassign it here. */
    if (server.cluster->slots[slot]) return C_ERR;
    clusterNodeSetSlotBit(n,slot);
    server.cluster->slots[slot] = n;
    return C_OK;
}

/* Delete the specified slot marking it as unassigned.
 * Returns C_OK if the slot was assigned, otherwise if the slot was
 * already unassigned C_ERR is returned. */
int clusterDelSlot(int slot) {
    clusterNode *n = server.cluster->slots[slot];

    if (!n) return C_ERR;

    /* Cleanup the channels in master/replica as part of slot deletion. */
    list *nodes_for_slot = clusterGetNodesInMyShard(n);
    serverAssert(nodes_for_slot != NULL);
    listNode *ln = listSearchKey(nodes_for_slot, myself);
    /* Only unsubscribe shard channels if this node belongs to the shard
     * that owned the slot. */
    if (ln != NULL) {
        removeChannelsInSlot(slot);
    }
    serverAssert(clusterNodeClearSlotBit(n,slot) == 1);
    server.cluster->slots[slot] = NULL;
    return C_OK;
}

/* Delete all the slots associated with the specified node.
 * The number of deleted slots is returned. */
int clusterDelNodeSlots(clusterNode *node) {
    int deleted = 0, j;

    for (j = 0; j < CLUSTER_SLOTS; j++) {
        if (clusterNodeGetSlotBit(node,j)) {
            clusterDelSlot(j);
            deleted++;
        }
    }
    return deleted;
}

/* Clear the migrating / importing state for all the slots.
 * This is useful at initialization and when turning a master into slave. */
void clusterCloseAllSlots(void) {
    memset(server.cluster->migrating_slots_to,0,
        sizeof(server.cluster->migrating_slots_to));
    memset(server.cluster->importing_slots_from,0,
        sizeof(server.cluster->importing_slots_from));
}
/* -----------------------------------------------------------------------------
 * Cluster state evaluation function
 * -------------------------------------------------------------------------- */

/* The following are defines that are only used in the evaluation function
 * and are based on heuristics. Actually the main point about the rejoin and
 * writable delay is that they should be a few orders of magnitude larger
 * than the network latency. */
#define CLUSTER_MAX_REJOIN_DELAY 5000
#define CLUSTER_MIN_REJOIN_DELAY 500
#define CLUSTER_WRITABLE_DELAY 2000

/* Recompute server.cluster->state (OK/FAIL) and server.cluster->size from
 * the current view of the cluster: slot coverage, reachable masters and
 * quorum. Called from clusterCron() and clusterBeforeSleep(). */
void clusterUpdateState(void) {
    int j, new_state;
    int reachable_masters = 0;
    /* Both statics persist across calls: among_minority_time remembers when
     * we last detected being in a minority partition; first_call_time is the
     * time of the first invocation (used for the writable delay below). */
    static mstime_t among_minority_time;
    static mstime_t first_call_time = 0;

    server.cluster->todo_before_sleep &= ~CLUSTER_TODO_UPDATE_STATE;

    /* If this is a master node, wait some time before turning the state
     * into OK, since it is not a good idea to rejoin the cluster as a writable
     * master, after a reboot, without giving the cluster a chance to
     * reconfigure this node. Note that the delay is calculated starting from
     * the first call to this function and not since the server start, in order
     * to not count the DB loading time. */
    if (first_call_time == 0) first_call_time = mstime();
    if (nodeIsMaster(myself) &&
        server.cluster->state == CLUSTER_FAIL &&
        mstime() - first_call_time < CLUSTER_WRITABLE_DELAY) return;

    /* Start assuming the state is OK. We'll turn it into FAIL if there
     * are the right conditions. */
    new_state = CLUSTER_OK;

    /* Check if all the slots are covered. */
    if (server.cluster_require_full_coverage) {
        for (j = 0; j < CLUSTER_SLOTS; j++) {
            if (server.cluster->slots[j] == NULL ||
                server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL))
            {
                new_state = CLUSTER_FAIL;
                break;
            }
        }
    }

    /* Compute the cluster size, that is the number of master nodes
     * serving at least a single slot.
     *
     * At the same time count the number of reachable masters having
     * at least one slot. */
    {
        dictIterator *di;
        dictEntry *de;

        server.cluster->size = 0;
        di = dictGetSafeIterator(server.cluster->nodes);
        while((de = dictNext(di)) != NULL) {
            clusterNode *node = dictGetVal(de);

            if (nodeIsMaster(node) && node->numslots) {
                server.cluster->size++;
                if ((node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) == 0)
                    reachable_masters++;
            }
        }
        dictReleaseIterator(di);
    }

    /* If we are in a minority partition, change the cluster state
     * to FAIL. */
    {
        int needed_quorum = (server.cluster->size / 2) + 1;

        if (reachable_masters < needed_quorum) {
            new_state = CLUSTER_FAIL;
            among_minority_time = mstime();
        }
    }

    /* Log a state change */
    if (new_state != server.cluster->state) {
        mstime_t rejoin_delay = server.cluster_node_timeout;

        /* If the instance is a master and was partitioned away with the
         * minority, don't let it accept queries for some time after the
         * partition heals, to make sure there is enough time to receive
         * a configuration update. */
        if (rejoin_delay > CLUSTER_MAX_REJOIN_DELAY)
            rejoin_delay = CLUSTER_MAX_REJOIN_DELAY;
        if (rejoin_delay < CLUSTER_MIN_REJOIN_DELAY)
            rejoin_delay = CLUSTER_MIN_REJOIN_DELAY;

        if (new_state == CLUSTER_OK &&
            nodeIsMaster(myself) &&
            mstime() - among_minority_time < rejoin_delay)
        {
            return;
        }

        /* Change the state and log the event. */
        serverLog(new_state == CLUSTER_OK ? LL_NOTICE : LL_WARNING,
            "Cluster state changed: %s",
            new_state == CLUSTER_OK ? "ok" : "fail");
        server.cluster->state = new_state;
    }
}
/* This function is called after the node startup in order to verify that data
 * loaded from disk is in agreement with the cluster configuration:
 *
 * 1) If we find keys about hash slots we have no responsibility for, the
 *    following happens:
 *    A) If no other node is in charge according to the current cluster
 *       configuration, we add these slots to our node.
 *    B) If according to our config other nodes are already in charge for
 *       this slots, we set the slots as IMPORTING from our point of view
 *       in order to justify we have those slots, and in order to make
 *       redis-cli aware of the issue, so that it can try to fix it.
 * 2) If we find data in a DB different than DB0 we return C_ERR to
 *    signal the caller it should quit the server with an error message
 *    or take other actions.
 *
 * The function always returns C_OK even if it will try to correct
 * the error described in "1". However if data is found in DB different
 * from DB0, C_ERR is returned.
 *
 * The function also uses the logging facility in order to warn the user
 * about desynchronizations between the data we have in memory and the
 * cluster configuration. */
int verifyClusterConfigWithData(void) {
    int j;
    int update_config = 0;

    /* Return ASAP if a module disabled cluster redirections. In that case
     * every master can store keys about every possible hash slot. */
    if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION)
        return C_OK;

    /* If this node is a slave, don't perform the check at all as we
     * completely depend on the replication stream. */
    if (nodeIsSlave(myself)) return C_OK;

    /* Make sure we only have keys in DB0. */
    for (j = 1; j < server.dbnum; j++) {
        if (dictSize(server.db[j].dict)) return C_ERR;
    }

    /* Check that all the slots we see populated memory have a corresponding
     * entry in the cluster table. Otherwise fix the table. */
    for (j = 0; j < CLUSTER_SLOTS; j++) {
        if (!countKeysInSlot(j)) continue; /* No keys in this slot. */
        /* Check if we are assigned to this slot or if we are importing it.
         * In both cases check the next slot as the configuration makes
         * sense. */
        if (server.cluster->slots[j] == myself ||
            server.cluster->importing_slots_from[j] != NULL) continue;

        /* If we are here data and cluster config don't agree, and we have
         * slot 'j' populated even if we are not importing it, nor we are
         * assigned to this slot. Fix this condition. */

        update_config++;
        /* Case A: slot is unassigned. Take responsibility for it. */
        if (server.cluster->slots[j] == NULL) {
            serverLog(LL_NOTICE, "I have keys for unassigned slot %d. "
                                    "Taking responsibility for it.",j);
            clusterAddSlot(myself,j);
        } else {
            serverLog(LL_NOTICE, "I have keys for slot %d, but the slot is "
                                    "assigned to another node. "
                                    "Setting it to importing state.",j);
            server.cluster->importing_slots_from[j] = server.cluster->slots[j];
        }
    }
    /* Persist any correction we made (with fsync) so the fixed view
     * survives a restart. */
    if (update_config) clusterSaveConfigOrDie(1);
    return C_OK;
}

/* -----------------------------------------------------------------------------
 * SLAVE nodes handling
 * -------------------------------------------------------------------------- */

/* Set the specified node 'n' as master for this node.
 * If this node is currently a master, it is turned into a slave. */
void clusterSetMaster(clusterNode *n) {
    serverAssert(n != myself);
    serverAssert(myself->numslots == 0);

    if (nodeIsMaster(myself)) {
        myself->flags &= ~(CLUSTER_NODE_MASTER|CLUSTER_NODE_MIGRATE_TO);
        myself->flags |= CLUSTER_NODE_SLAVE;
        clusterCloseAllSlots();
    } else {
        if (myself->slaveof)
            clusterNodeRemoveSlave(myself->slaveof,myself);
    }
    myself->slaveof = n;
    updateShardId(myself, n->shard_id);
    clusterNodeAddSlave(n,myself);
    replicationSetMaster(n->ip, getNodeDefaultReplicationPort(n));
    /* Any in-progress manual failover is no longer meaningful once we
     * switched master. */
    resetManualFailover();
}
/* -----------------------------------------------------------------------------
 * Nodes to string representation functions.
 * -------------------------------------------------------------------------- */

/* Maps a CLUSTER_NODE_* flag bit to its textual form (trailing comma
 * included; the final comma is stripped in representClusterNodeFlags()). */
struct redisNodeFlags {
    uint16_t flag;
    char *name;
};

static struct redisNodeFlags redisNodeFlagsTable[] = {
    {CLUSTER_NODE_MYSELF, "myself,"},
    {CLUSTER_NODE_MASTER, "master,"},
    {CLUSTER_NODE_SLAVE, "slave,"},
    {CLUSTER_NODE_PFAIL, "fail?,"},
    {CLUSTER_NODE_FAIL, "fail,"},
    {CLUSTER_NODE_HANDSHAKE, "handshake,"},
    {CLUSTER_NODE_NOADDR, "noaddr,"},
    {CLUSTER_NODE_NOFAILOVER, "nofailover,"}
};

/* Concatenate the comma separated list of node flags to the given SDS
 * string 'ci'. */
sds representClusterNodeFlags(sds ci, uint16_t flags) {
    size_t orig_len = sdslen(ci);
    int i, size = sizeof(redisNodeFlagsTable)/sizeof(struct redisNodeFlags);
    for (i = 0; i < size; i++) {
        struct redisNodeFlags *nodeflag = redisNodeFlagsTable + i;
        if (flags & nodeflag->flag) ci = sdscat(ci, nodeflag->name);
    }
    /* If no flag was added, add the "noflags" special flag. */
    if (sdslen(ci) == orig_len) ci = sdscat(ci,"noflags,");
    sdsIncrLen(ci,-1); /* Remove trailing comma. */
    return ci;
}

/* Concatenate the slot ownership information to the given SDS string 'ci'.
 * If the slot ownership is in a contiguous block, it's represented as start-end pair,
 * else each slot is added separately. */
sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count) {
    for (int i = 0; i< slot_info_pairs_count; i+=2) {
        unsigned long start = slot_info_pairs[i];
        unsigned long end = slot_info_pairs[i+1];
        /* NOTE(review): sdscatfmt's "%i" consumes an int while start/end are
         * unsigned long — values fit (slots < 16384) but verify sdscatfmt's
         * va_arg handling, or narrow the locals. */
        if (start == end) {
            ci = sdscatfmt(ci, " %i", start);
        } else {
            ci = sdscatfmt(ci, " %i-%i", start, end);
        }
    }
    return ci;
}
/* Generate a csv-alike representation of the specified cluster node.
 * See clusterGenNodesDescription() top comment for more information.
 *
 * The function returns the string representation as an SDS string. */
sds clusterGenNodeDescription(client *c, clusterNode *node, int tls_primary) {
    int j, start;
    sds ci;
    int port = getNodeClientPort(node, tls_primary);

    /* Node coordinates */
    ci = sdscatlen(sdsempty(),node->name,CLUSTER_NAMELEN);
    ci = sdscatfmt(ci," %s:%i@%i",
        node->ip,
        port,
        node->cport);
    if (sdslen(node->hostname) != 0) {
        ci = sdscatfmt(ci,",%s", node->hostname);
    }
    /* Emit the empty-hostname separator so the field count stays fixed. */
    if (sdslen(node->hostname) == 0) {
        ci = sdscatfmt(ci,",", 1);
    }
    /* Don't expose aux fields to any clients yet but do allow them
     * to be persisted to nodes.conf */
    if (c == NULL) {
        for (int i = af_count-1; i >=0; i--) {
            if ((tls_primary && i == af_tls_port) || (!tls_primary && i == af_tcp_port)) {
                continue;
            }
            if (auxFieldHandlers[i].isPresent(node)) {
                ci = sdscatprintf(ci, ",%s=", auxFieldHandlers[i].field);
                ci = auxFieldHandlers[i].getter(node, ci);
            }
        }
    }

    /* Flags */
    ci = sdscatlen(ci," ",1);
    ci = representClusterNodeFlags(ci, node->flags);

    /* Slave of... or just "-" */
    ci = sdscatlen(ci," ",1);
    if (node->slaveof)
        ci = sdscatlen(ci,node->slaveof->name,CLUSTER_NAMELEN);
    else
        ci = sdscatlen(ci,"-",1);

    /* A replica reports its master's config epoch. */
    unsigned long long nodeEpoch = node->configEpoch;
    if (nodeIsSlave(node) && node->slaveof) {
        nodeEpoch = node->slaveof->configEpoch;
    }
    /* Latency from the POV of this node, config epoch, link status */
    ci = sdscatfmt(ci," %I %I %U %s",
        (long long) node->ping_sent,
        (long long) node->pong_received,
        nodeEpoch,
        (node->link || node->flags & CLUSTER_NODE_MYSELF) ?
                    "connected" : "disconnected");

    /* Slots served by this instance. If we already have slots info,
     * append it directly, otherwise, generate slots only if it has. */
    if (node->slot_info_pairs) {
        ci = representSlotInfo(ci, node->slot_info_pairs, node->slot_info_pairs_count);
    } else if (node->numslots > 0) {
        start = -1;
        for (j = 0; j < CLUSTER_SLOTS; j++) {
            int bit;

            if ((bit = clusterNodeGetSlotBit(node,j)) != 0) {
                if (start == -1) start = j;
            }
            if (start != -1 && (!bit || j == CLUSTER_SLOTS-1)) {
                /* Close the current run; bump j so the last slot is
                 * included when the run extends to CLUSTER_SLOTS-1. */
                if (bit && j == CLUSTER_SLOTS-1) j++;

                if (start == j-1) {
                    ci = sdscatfmt(ci," %i",start);
                } else {
                    ci = sdscatfmt(ci," %i-%i",start,j-1);
                }
                start = -1;
            }
        }
    }

    /* Just for MYSELF node we also dump info about slots that
     * we are migrating to other instances or importing from other
     * instances. */
    if (node->flags & CLUSTER_NODE_MYSELF) {
        for (j = 0; j < CLUSTER_SLOTS; j++) {
            if (server.cluster->migrating_slots_to[j]) {
                ci = sdscatprintf(ci," [%d->-%.40s]",j,
                    server.cluster->migrating_slots_to[j]->name);
            } else if (server.cluster->importing_slots_from[j]) {
                ci = sdscatprintf(ci," [%d-<-%.40s]",j,
                    server.cluster->importing_slots_from[j]->name);
            }
        }
    }
    return ci;
}

/* Generate the slot topology for all nodes and store the string representation
 * in the slots_info struct on the node. This is used to improve the efficiency
 * of clusterGenNodesDescription() because it removes looping of the slot space
 * for generating the slot info for each node individually. */
void clusterGenNodesSlotsInfo(int filter) {
    clusterNode *n = NULL;
    int start = -1;

    /* Note: iterates one past the last slot so the final run is flushed. */
    for (int i = 0; i <= CLUSTER_SLOTS; i++) {
        /* Find start node and slot id. */
        if (n == NULL) {
            if (i == CLUSTER_SLOTS) break;
            n = server.cluster->slots[i];
            start = i;
            continue;
        }

        /* Generate slots info when occur different node with start
         * or end of slot. */
        if (i == CLUSTER_SLOTS || n != server.cluster->slots[i]) {
            if (!(n->flags & filter)) {
                if (!n->slot_info_pairs) {
                    n->slot_info_pairs = zmalloc(2 * n->numslots * sizeof(uint16_t));
                }
                serverAssert((n->slot_info_pairs_count + 1) < (2 * n->numslots));
                n->slot_info_pairs[n->slot_info_pairs_count++] = start;
                n->slot_info_pairs[n->slot_info_pairs_count++] = i-1;
            }
            if (i == CLUSTER_SLOTS) break;
            n = server.cluster->slots[i];
            start = i;
        }
    }
}

/* Release the cached slot-range pairs built by clusterGenNodesSlotsInfo(). */
void clusterFreeNodesSlotsInfo(clusterNode *n) {
    zfree(n->slot_info_pairs);
    n->slot_info_pairs = NULL;
    n->slot_info_pairs_count = 0;
}

/* Generate a csv-alike representation of the nodes we are aware of,
 * including the "myself" node, and return an SDS string containing the
 * representation (it is up to the caller to free it).
 *
 * All the nodes matching at least one of the node flags specified in
 * "filter" are excluded from the output, so using zero as a filter will
 * include all the known nodes in the representation, including nodes in
 * the HANDSHAKE state.
 *
 * Setting tls_primary to 1 to put TLS port in the main <ip>:<port>
 * field and put TCP port in aux field, instead of the opposite way.
 *
 * The representation obtained using this function is used for the output
 * of the CLUSTER NODES function, and as format for the cluster
 * configuration file (nodes.conf) for a given node. */
sds clusterGenNodesDescription(client *c, int filter, int tls_primary) {
    sds ci = sdsempty(), ni;
    dictIterator *di;
    dictEntry *de;

    /* Generate all nodes slots info firstly. */
    clusterGenNodesSlotsInfo(filter);

    di = dictGetSafeIterator(server.cluster->nodes);
    while((de = dictNext(di)) != NULL) {
        clusterNode *node = dictGetVal(de);

        if (node->flags & filter) continue;
        ni = clusterGenNodeDescription(c, node, tls_primary);
        ci = sdscatsds(ci,ni);
        sdsfree(ni);
        ci = sdscatlen(ci,"\n",1);

        /* Release slots info. */
        clusterFreeNodesSlotsInfo(node);
    }
    dictReleaseIterator(di);
    return ci;
}

/* Add to the output buffer of the given client the description of the given cluster link.
 * The description is a map with each entry being an attribute of the link. */
void addReplyClusterLinkDescription(client *c, clusterLink *link) {
    addReplyMapLen(c, 6);

    addReplyBulkCString(c, "direction");
    addReplyBulkCString(c, link->inbound ? "from" : "to");

    /* addReplyClusterLinkDescription is only called for links that have been
     * associated with nodes. The association is always bi-directional, so
     * in addReplyClusterLinkDescription, link->node should never be NULL. */
    serverAssert(link->node);
    sds node_name = sdsnewlen(link->node->name, CLUSTER_NAMELEN);
    addReplyBulkCString(c, "node");
    addReplyBulkCString(c, node_name);
    sdsfree(node_name);

    addReplyBulkCString(c, "create-time");
    addReplyLongLong(c, link->ctime);

    char events[3], *p;
    p = events;
    if (link->conn) {
        if (connHasReadHandler(link->conn)) *p++ = 'r';
        if (connHasWriteHandler(link->conn)) *p++ = 'w';
    }
    *p = '\0';
    addReplyBulkCString(c, "events");
    addReplyBulkCString(c, events);

    /* NOTE(review): both fields below report send_msg_queue_mem; confirm the
     * "allocated" vs "used" distinction is intentionally collapsed. */
    addReplyBulkCString(c, "send-buffer-allocated");
    addReplyLongLong(c, link->send_msg_queue_mem);

    addReplyBulkCString(c, "send-buffer-used");
    addReplyLongLong(c, link->send_msg_queue_mem);
}
*/ -void addReplyClusterLinksDescription(client *c) { - dictIterator *di; - dictEntry *de; - void *arraylen_ptr = NULL; - int num_links = 0; - - arraylen_ptr = addReplyDeferredLen(c); - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - if (node->link) { - num_links++; - addReplyClusterLinkDescription(c, node->link); - } - if (node->inbound_link) { - num_links++; - addReplyClusterLinkDescription(c, node->inbound_link); - } - } - dictReleaseIterator(di); - - setDeferredArrayLen(c, arraylen_ptr, num_links); -} - -/* ----------------------------------------------------------------------------- - * CLUSTER command - * -------------------------------------------------------------------------- */ - -const char *getPreferredEndpoint(clusterNode *n) { - switch(server.cluster_preferred_endpoint_type) { - case CLUSTER_ENDPOINT_TYPE_IP: return n->ip; - case CLUSTER_ENDPOINT_TYPE_HOSTNAME: return (sdslen(n->hostname) != 0) ? 
n->hostname : "?"; - case CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT: return ""; - } - return "unknown"; -} - -const char *clusterGetMessageTypeString(int type) { - switch(type) { - case CLUSTERMSG_TYPE_PING: return "ping"; - case CLUSTERMSG_TYPE_PONG: return "pong"; - case CLUSTERMSG_TYPE_MEET: return "meet"; - case CLUSTERMSG_TYPE_FAIL: return "fail"; - case CLUSTERMSG_TYPE_PUBLISH: return "publish"; - case CLUSTERMSG_TYPE_PUBLISHSHARD: return "publishshard"; - case CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST: return "auth-req"; - case CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK: return "auth-ack"; - case CLUSTERMSG_TYPE_UPDATE: return "update"; - case CLUSTERMSG_TYPE_MFSTART: return "mfstart"; - case CLUSTERMSG_TYPE_MODULE: return "module"; - } - return "unknown"; -} - -int getSlotOrReply(client *c, robj *o) { - long long slot; - - if (getLongLongFromObject(o,&slot) != C_OK || - slot < 0 || slot >= CLUSTER_SLOTS) - { - addReplyError(c,"Invalid or out of range slot"); - return -1; - } - return (int) slot; -} - -/* Returns an indication if the replica node is fully available - * and should be listed in CLUSTER SLOTS response. - * Returns 1 for available nodes, 0 for nodes that have - * not finished their initial sync, in failed state, or are - * otherwise considered not available to serve read commands. */ -static int isReplicaAvailable(clusterNode *node) { - if (nodeFailed(node)) { - return 0; - } - long long repl_offset = node->repl_offset; - if (node->flags & CLUSTER_NODE_MYSELF) { - /* Nodes do not update their own information - * in the cluster node list. 
*/ - repl_offset = replicationGetSlaveOffset(); - } - return (repl_offset != 0); -} - -int checkSlotAssignmentsOrReply(client *c, unsigned char *slots, int del, int start_slot, int end_slot) { - int slot; - for (slot = start_slot; slot <= end_slot; slot++) { - if (del && server.cluster->slots[slot] == NULL) { - addReplyErrorFormat(c,"Slot %d is already unassigned", slot); - return C_ERR; - } else if (!del && server.cluster->slots[slot]) { - addReplyErrorFormat(c,"Slot %d is already busy", slot); - return C_ERR; - } - if (slots[slot]++ == 1) { - addReplyErrorFormat(c,"Slot %d specified multiple times",(int)slot); - return C_ERR; - } - } - return C_OK; -} - -void clusterUpdateSlots(client *c, unsigned char *slots, int del) { - int j; - for (j = 0; j < CLUSTER_SLOTS; j++) { - if (slots[j]) { - int retval; - - /* If this slot was set as importing we can clear this - * state as now we are the real owner of the slot. */ - if (server.cluster->importing_slots_from[j]) - server.cluster->importing_slots_from[j] = NULL; - - retval = del ? clusterDelSlot(j) : - clusterAddSlot(myself,j); - serverAssertWithInfo(c,NULL,retval == C_OK); - } - } -} - -void addNodeToNodeReply(client *c, clusterNode *node) { - addReplyArrayLen(c, 4); - if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_IP) { - addReplyBulkCString(c, node->ip); - } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_HOSTNAME) { - if (sdslen(node->hostname) != 0) { - addReplyBulkCBuffer(c, node->hostname, sdslen(node->hostname)); - } else { - addReplyBulkCString(c, "?"); - } - } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT) { - addReplyNull(c); - } else { - serverPanic("Unrecognized preferred endpoint type"); - } - - /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. 
*/ - addReplyLongLong(c, getNodeClientPort(node, connIsTLS(c->conn))); - addReplyBulkCBuffer(c, node->name, CLUSTER_NAMELEN); - - /* Add the additional endpoint information, this is all the known networking information - * that is not the preferred endpoint. Note the logic is evaluated twice so we can - * correctly report the number of additional network arguments without using a deferred - * map, an assertion is made at the end to check we set the right length. */ - int length = 0; - if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) { - length++; - } - if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME - && sdslen(node->hostname) != 0) - { - length++; - } - addReplyMapLen(c, length); - - if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) { - addReplyBulkCString(c, "ip"); - addReplyBulkCString(c, node->ip); - length--; - } - if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME - && sdslen(node->hostname) != 0) - { - addReplyBulkCString(c, "hostname"); - addReplyBulkCBuffer(c, node->hostname, sdslen(node->hostname)); - length--; - } - serverAssert(length == 0); -} - -void addNodeReplyForClusterSlot(client *c, clusterNode *node, int start_slot, int end_slot) { - int i, nested_elements = 3; /* slots (2) + master addr (1) */ - for (i = 0; i < node->numslaves; i++) { - if (!isReplicaAvailable(node->slaves[i])) continue; - nested_elements++; - } - addReplyArrayLen(c, nested_elements); - addReplyLongLong(c, start_slot); - addReplyLongLong(c, end_slot); - addNodeToNodeReply(c, node); - - /* Remaining nodes in reply are replicas for slot range */ - for (i = 0; i < node->numslaves; i++) { - /* This loop is copy/pasted from clusterGenNodeDescription() - * with modifications for per-slot node aggregation. 
*/ - if (!isReplicaAvailable(node->slaves[i])) continue; - addNodeToNodeReply(c, node->slaves[i]); - nested_elements--; - } - serverAssert(nested_elements == 3); /* Original 3 elements */ -} - -/* Add detailed information of a node to the output buffer of the given client. */ -void addNodeDetailsToShardReply(client *c, clusterNode *node) { - int reply_count = 0; - void *node_replylen = addReplyDeferredLen(c); - addReplyBulkCString(c, "id"); - addReplyBulkCBuffer(c, node->name, CLUSTER_NAMELEN); - reply_count++; - - if (node->tcp_port) { - addReplyBulkCString(c, "port"); - addReplyLongLong(c, node->tcp_port); - reply_count++; - } - - if (node->tls_port) { - addReplyBulkCString(c, "tls-port"); - addReplyLongLong(c, node->tls_port); - reply_count++; - } - - addReplyBulkCString(c, "ip"); - addReplyBulkCString(c, node->ip); - reply_count++; - - addReplyBulkCString(c, "endpoint"); - addReplyBulkCString(c, getPreferredEndpoint(node)); - reply_count++; - - if (sdslen(node->hostname) != 0) { - addReplyBulkCString(c, "hostname"); - addReplyBulkCBuffer(c, node->hostname, sdslen(node->hostname)); - reply_count++; - } - - long long node_offset; - if (node->flags & CLUSTER_NODE_MYSELF) { - node_offset = nodeIsSlave(node) ? replicationGetSlaveOffset() : server.master_repl_offset; - } else { - node_offset = node->repl_offset; - } - - addReplyBulkCString(c, "role"); - addReplyBulkCString(c, nodeIsSlave(node) ? "replica" : "master"); - reply_count++; - - addReplyBulkCString(c, "replication-offset"); - addReplyLongLong(c, node_offset); - reply_count++; - - addReplyBulkCString(c, "health"); - const char *health_msg = NULL; - if (nodeFailed(node)) { - health_msg = "fail"; - } else if (nodeIsSlave(node) && node_offset == 0) { - health_msg = "loading"; - } else { - health_msg = "online"; - } - addReplyBulkCString(c, health_msg); - reply_count++; - - setDeferredMapLen(c, node_replylen, reply_count); -} - -/* Add the shard reply of a single shard based off the given primary node. 
*/ -void addShardReplyForClusterShards(client *c, list *nodes) { - serverAssert(listLength(nodes) > 0); - clusterNode *n = listNodeValue(listFirst(nodes)); - addReplyMapLen(c, 2); - addReplyBulkCString(c, "slots"); - - /* Use slot_info_pairs from the primary only */ - while (n->slaveof != NULL) n = n->slaveof; - - if (n->slot_info_pairs != NULL) { - serverAssert((n->slot_info_pairs_count % 2) == 0); - addReplyArrayLen(c, n->slot_info_pairs_count); - for (int i = 0; i < n->slot_info_pairs_count; i++) - addReplyBulkLongLong(c, (unsigned long)n->slot_info_pairs[i]); - } else { - /* If no slot info pair is provided, the node owns no slots */ - addReplyArrayLen(c, 0); - } - - addReplyBulkCString(c, "nodes"); - addReplyArrayLen(c, listLength(nodes)); - listIter li; - listRewind(nodes, &li); - for (listNode *ln = listNext(&li); ln != NULL; ln = listNext(&li)) { - clusterNode *n = listNodeValue(ln); - addNodeDetailsToShardReply(c, n); - clusterFreeNodesSlotsInfo(n); - } -} - -/* Add to the output buffer of the given client, an array of slot (start, end) - * pair owned by the shard, also the primary and set of replica(s) along with - * information about each node. */ -void clusterReplyShards(client *c) { - addReplyArrayLen(c, dictSize(server.cluster->shards)); - /* This call will add slot_info_pairs to all nodes */ - clusterGenNodesSlotsInfo(0); - dictIterator *di = dictGetSafeIterator(server.cluster->shards); - for(dictEntry *de = dictNext(di); de != NULL; de = dictNext(di)) { - addShardReplyForClusterShards(c, dictGetVal(de)); - } - dictReleaseIterator(di); -} - -void clusterReplyMultiBulkSlots(client * c) { - /* Format: 1) 1) start slot - * 2) end slot - * 3) 1) master IP - * 2) master port - * 3) node ID - * 4) 1) replica IP - * 2) replica port - * 3) node ID - * ... 
continued until done - */ - clusterNode *n = NULL; - int num_masters = 0, start = -1; - void *slot_replylen = addReplyDeferredLen(c); - - for (int i = 0; i <= CLUSTER_SLOTS; i++) { - /* Find start node and slot id. */ - if (n == NULL) { - if (i == CLUSTER_SLOTS) break; - n = server.cluster->slots[i]; - start = i; - continue; - } - - /* Add cluster slots info when occur different node with start - * or end of slot. */ - if (i == CLUSTER_SLOTS || n != server.cluster->slots[i]) { - addNodeReplyForClusterSlot(c, n, start, i-1); - num_masters++; - if (i == CLUSTER_SLOTS) break; - n = server.cluster->slots[i]; - start = i; - } - } - setDeferredArrayLen(c, slot_replylen, num_masters); -} - -sds genClusterInfoString(void) { - sds info = sdsempty(); - char *statestr[] = {"ok","fail"}; - int slots_assigned = 0, slots_ok = 0, slots_pfail = 0, slots_fail = 0; - uint64_t myepoch; - int j; - - for (j = 0; j < CLUSTER_SLOTS; j++) { - clusterNode *n = server.cluster->slots[j]; - - if (n == NULL) continue; - slots_assigned++; - if (nodeFailed(n)) { - slots_fail++; - } else if (nodeTimedOut(n)) { - slots_pfail++; - } else { - slots_ok++; - } - } - - myepoch = (nodeIsSlave(myself) && myself->slaveof) ? - myself->slaveof->configEpoch : myself->configEpoch; - - info = sdscatprintf(info, - "cluster_state:%s\r\n" - "cluster_slots_assigned:%d\r\n" - "cluster_slots_ok:%d\r\n" - "cluster_slots_pfail:%d\r\n" - "cluster_slots_fail:%d\r\n" - "cluster_known_nodes:%lu\r\n" - "cluster_size:%d\r\n" - "cluster_current_epoch:%llu\r\n" - "cluster_my_epoch:%llu\r\n" - , statestr[server.cluster->state], - slots_assigned, - slots_ok, - slots_pfail, - slots_fail, - dictSize(server.cluster->nodes), - server.cluster->size, - (unsigned long long) server.cluster->currentEpoch, - (unsigned long long) myepoch - ); - - /* Show stats about messages sent and received. 
*/ - long long tot_msg_sent = 0; - long long tot_msg_received = 0; - - for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { - if (server.cluster->stats_bus_messages_sent[i] == 0) continue; - tot_msg_sent += server.cluster->stats_bus_messages_sent[i]; - info = sdscatprintf(info, - "cluster_stats_messages_%s_sent:%lld\r\n", - clusterGetMessageTypeString(i), - server.cluster->stats_bus_messages_sent[i]); - } - info = sdscatprintf(info, - "cluster_stats_messages_sent:%lld\r\n", tot_msg_sent); - - for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { - if (server.cluster->stats_bus_messages_received[i] == 0) continue; - tot_msg_received += server.cluster->stats_bus_messages_received[i]; - info = sdscatprintf(info, - "cluster_stats_messages_%s_received:%lld\r\n", - clusterGetMessageTypeString(i), - server.cluster->stats_bus_messages_received[i]); - } - info = sdscatprintf(info, - "cluster_stats_messages_received:%lld\r\n", tot_msg_received); - - info = sdscatprintf(info, - "total_cluster_links_buffer_limit_exceeded:%llu\r\n", - server.cluster->stat_cluster_links_buffer_limit_exceeded); - - return info; -} - -void clusterCommand(client *c) { - if (server.cluster_enabled == 0) { - addReplyError(c,"This instance has cluster support disabled"); - return; - } - - if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"help")) { - const char *help[] = { -"ADDSLOTS [ ...]", -" Assign slots to current node.", -"ADDSLOTSRANGE [ ...]", -" Assign slots which are between and to current node.", -"BUMPEPOCH", -" Advance the cluster config epoch.", -"COUNT-FAILURE-REPORTS ", -" Return number of failure reports for .", -"COUNTKEYSINSLOT ", -" Return the number of keys in .", -"DELSLOTS [ ...]", -" Delete slots information from current node.", -"DELSLOTSRANGE [ ...]", -" Delete slots information which are between and from current node.", -"FAILOVER [FORCE|TAKEOVER]", -" Promote current replica node to being a master.", -"FORGET ", -" Remove a node from the cluster.", -"GETKEYSINSLOT ", -" Return key 
names stored by current node in a slot.", -"FLUSHSLOTS", -" Delete current node own slots information.", -"INFO", -" Return information about the cluster.", -"KEYSLOT ", -" Return the hash slot for .", -"MEET []", -" Connect nodes into a working cluster.", -"MYID", -" Return the node id.", -"MYSHARDID", -" Return the node's shard id.", -"NODES", -" Return cluster configuration seen by node. Output format:", -" ...", -"REPLICATE ", -" Configure current node as replica to .", -"RESET [HARD|SOFT]", -" Reset current node (default: soft).", -"SET-CONFIG-EPOCH ", -" Set config epoch of current node.", -"SETSLOT (IMPORTING |MIGRATING |STABLE|NODE )", -" Set slot state.", -"REPLICAS ", -" Return replicas.", -"SAVECONFIG", -" Force saving cluster configuration on disk.", -"SLOTS", -" Return information about slots range mappings. Each range is made of:", -" start, end, master and replicas IP addresses, ports and ids", -"SHARDS", -" Return information about slot range mappings and the nodes associated with them.", -"LINKS", -" Return information about all network links between this node and its peers.", -" Output format is an array where each array element is a map containing attributes of a link", -NULL - }; - addReplyHelp(c, help); - } else if (!strcasecmp(c->argv[1]->ptr,"meet") && (c->argc == 4 || c->argc == 5)) { - /* CLUSTER MEET [cport] */ - long long port, cport; - - if (getLongLongFromObject(c->argv[3], &port) != C_OK) { - addReplyErrorFormat(c,"Invalid base port specified: %s", - (char*)c->argv[3]->ptr); - return; - } - - if (c->argc == 5) { - if (getLongLongFromObject(c->argv[4], &cport) != C_OK) { - addReplyErrorFormat(c,"Invalid bus port specified: %s", - (char*)c->argv[4]->ptr); - return; - } - } else { - cport = port + CLUSTER_PORT_INCR; - } - - if (clusterStartHandshake(c->argv[2]->ptr,port,cport) == 0 && - errno == EINVAL) - { - addReplyErrorFormat(c,"Invalid node address specified: %s:%s", - (char*)c->argv[2]->ptr, (char*)c->argv[3]->ptr); - } else { - 
addReply(c,shared.ok); - } - } else if (!strcasecmp(c->argv[1]->ptr,"nodes") && c->argc == 2) { - /* CLUSTER NODES */ - /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ - sds nodes = clusterGenNodesDescription(c, 0, connIsTLS(c->conn)); - addReplyVerbatim(c,nodes,sdslen(nodes),"txt"); - sdsfree(nodes); - } else if (!strcasecmp(c->argv[1]->ptr,"myid") && c->argc == 2) { - /* CLUSTER MYID */ - addReplyBulkCBuffer(c,myself->name, CLUSTER_NAMELEN); - } else if (!strcasecmp(c->argv[1]->ptr,"myshardid") && c->argc == 2) { - /* CLUSTER MYSHARDID */ - addReplyBulkCBuffer(c,myself->shard_id, CLUSTER_NAMELEN); - } else if (!strcasecmp(c->argv[1]->ptr,"slots") && c->argc == 2) { - /* CLUSTER SLOTS */ - clusterReplyMultiBulkSlots(c); - } else if (!strcasecmp(c->argv[1]->ptr,"shards") && c->argc == 2) { - /* CLUSTER SHARDS */ - clusterReplyShards(c); - } else if (!strcasecmp(c->argv[1]->ptr,"flushslots") && c->argc == 2) { - /* CLUSTER FLUSHSLOTS */ - if (dictSize(server.db[0].dict) != 0) { - addReplyError(c,"DB must be empty to perform CLUSTER FLUSHSLOTS."); - return; - } - clusterDelNodeSlots(myself); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if ((!strcasecmp(c->argv[1]->ptr,"addslots") || - !strcasecmp(c->argv[1]->ptr,"delslots")) && c->argc >= 3) - { - /* CLUSTER ADDSLOTS [slot] ... */ - /* CLUSTER DELSLOTS [slot] ... */ - int j, slot; - unsigned char *slots = zmalloc(CLUSTER_SLOTS); - int del = !strcasecmp(c->argv[1]->ptr,"delslots"); - - memset(slots,0,CLUSTER_SLOTS); - /* Check that all the arguments are parseable.*/ - for (j = 2; j < c->argc; j++) { - if ((slot = getSlotOrReply(c,c->argv[j])) == C_ERR) { - zfree(slots); - return; - } - } - /* Check that the slots are not already busy. 
*/ - for (j = 2; j < c->argc; j++) { - slot = getSlotOrReply(c,c->argv[j]); - if (checkSlotAssignmentsOrReply(c, slots, del, slot, slot) == C_ERR) { - zfree(slots); - return; - } - } - clusterUpdateSlots(c, slots, del); - zfree(slots); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if ((!strcasecmp(c->argv[1]->ptr,"addslotsrange") || - !strcasecmp(c->argv[1]->ptr,"delslotsrange")) && c->argc >= 4) { - if (c->argc % 2 == 1) { - addReplyErrorArity(c); - return; - } - /* CLUSTER ADDSLOTSRANGE [ ...] */ - /* CLUSTER DELSLOTSRANGE [ ...] */ - int j, startslot, endslot; - unsigned char *slots = zmalloc(CLUSTER_SLOTS); - int del = !strcasecmp(c->argv[1]->ptr,"delslotsrange"); - - memset(slots,0,CLUSTER_SLOTS); - /* Check that all the arguments are parseable and that all the - * slots are not already busy. */ - for (j = 2; j < c->argc; j += 2) { - if ((startslot = getSlotOrReply(c,c->argv[j])) == C_ERR) { - zfree(slots); - return; - } - if ((endslot = getSlotOrReply(c,c->argv[j+1])) == C_ERR) { - zfree(slots); - return; - } - if (startslot > endslot) { - addReplyErrorFormat(c,"start slot number %d is greater than end slot number %d", startslot, endslot); - zfree(slots); - return; - } - - if (checkSlotAssignmentsOrReply(c, slots, del, startslot, endslot) == C_ERR) { - zfree(slots); - return; - } - } - clusterUpdateSlots(c, slots, del); - zfree(slots); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"setslot") && c->argc >= 4) { - /* SETSLOT 10 MIGRATING */ - /* SETSLOT 10 IMPORTING */ - /* SETSLOT 10 STABLE */ - /* SETSLOT 10 NODE */ - int slot; - clusterNode *n; - - if (nodeIsSlave(myself)) { - addReplyError(c,"Please use SETSLOT only with masters."); - return; - } - - if ((slot = getSlotOrReply(c,c->argv[2])) == -1) return; - - if (!strcasecmp(c->argv[3]->ptr,"migrating") && c->argc == 5) { - if 
(server.cluster->slots[slot] != myself) { - addReplyErrorFormat(c,"I'm not the owner of hash slot %u",slot); - return; - } - n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (n == NULL) { - addReplyErrorFormat(c,"I don't know about node %s", - (char*)c->argv[4]->ptr); - return; - } - if (nodeIsSlave(n)) { - addReplyError(c,"Target node is not a master"); - return; - } - server.cluster->migrating_slots_to[slot] = n; - } else if (!strcasecmp(c->argv[3]->ptr,"importing") && c->argc == 5) { - if (server.cluster->slots[slot] == myself) { - addReplyErrorFormat(c, - "I'm already the owner of hash slot %u",slot); - return; - } - n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (n == NULL) { - addReplyErrorFormat(c,"I don't know about node %s", - (char*)c->argv[4]->ptr); - return; - } - if (nodeIsSlave(n)) { - addReplyError(c,"Target node is not a master"); - return; - } - server.cluster->importing_slots_from[slot] = n; - } else if (!strcasecmp(c->argv[3]->ptr,"stable") && c->argc == 4) { - /* CLUSTER SETSLOT STABLE */ - server.cluster->importing_slots_from[slot] = NULL; - server.cluster->migrating_slots_to[slot] = NULL; - } else if (!strcasecmp(c->argv[3]->ptr,"node") && c->argc == 5) { - /* CLUSTER SETSLOT NODE */ - n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", - (char*)c->argv[4]->ptr); - return; - } - if (nodeIsSlave(n)) { - addReplyError(c,"Target node is not a master"); - return; - } - /* If this hash slot was served by 'myself' before to switch - * make sure there are no longer local keys for this hash slot. 
*/ - if (server.cluster->slots[slot] == myself && n != myself) { - if (countKeysInSlot(slot) != 0) { - addReplyErrorFormat(c, - "Can't assign hashslot %d to a different node " - "while I still hold keys for this hash slot.", slot); - return; - } - } - /* If this slot is in migrating status but we have no keys - * for it assigning the slot to another node will clear - * the migrating status. */ - if (countKeysInSlot(slot) == 0 && - server.cluster->migrating_slots_to[slot]) - server.cluster->migrating_slots_to[slot] = NULL; - - int slot_was_mine = server.cluster->slots[slot] == myself; - clusterDelSlot(slot); - clusterAddSlot(n,slot); - - /* If we are a master left without slots, we should turn into a - * replica of the new master. */ - if (slot_was_mine && - n != myself && - myself->numslots == 0 && - server.cluster_allow_replica_migration) - { - serverLog(LL_NOTICE, - "Configuration change detected. Reconfiguring myself " - "as a replica of %.40s (%s)", n->name, n->human_nodename); - clusterSetMaster(n); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | - CLUSTER_TODO_UPDATE_STATE | - CLUSTER_TODO_FSYNC_CONFIG); - } - - /* If this node was importing this slot, assigning the slot to - * itself also clears the importing status. */ - if (n == myself && - server.cluster->importing_slots_from[slot]) - { - /* This slot was manually migrated, set this node configEpoch - * to a new epoch so that the new version can be propagated - * by the cluster. - * - * Note that if this ever results in a collision with another - * node getting the same configEpoch, for example because a - * failover happens at the same time we close the slot, the - * configEpoch collision resolution will fix it assigning - * a different epoch to each node. 
*/ - if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { - serverLog(LL_NOTICE, - "configEpoch updated after importing slot %d", slot); - } - server.cluster->importing_slots_from[slot] = NULL; - /* After importing this slot, let the other nodes know as - * soon as possible. */ - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); - } - } else { - addReplyError(c, - "Invalid CLUSTER SETSLOT action or number of arguments. Try CLUSTER HELP"); - return; - } - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"bumpepoch") && c->argc == 2) { - /* CLUSTER BUMPEPOCH */ - int retval = clusterBumpConfigEpochWithoutConsensus(); - sds reply = sdscatprintf(sdsempty(),"+%s %llu\r\n", - (retval == C_OK) ? "BUMPED" : "STILL", - (unsigned long long) myself->configEpoch); - addReplySds(c,reply); - } else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) { - /* CLUSTER INFO */ - - sds info = genClusterInfoString(); - - /* Produce the reply protocol. 
*/ - addReplyVerbatim(c,info,sdslen(info),"txt"); - sdsfree(info); - } else if (!strcasecmp(c->argv[1]->ptr,"saveconfig") && c->argc == 2) { - int retval = clusterSaveConfig(1); - - if (retval == 0) - addReply(c,shared.ok); - else - addReplyErrorFormat(c,"error saving the cluster node config: %s", - strerror(errno)); - } else if (!strcasecmp(c->argv[1]->ptr,"keyslot") && c->argc == 3) { - /* CLUSTER KEYSLOT */ - sds key = c->argv[2]->ptr; - - addReplyLongLong(c,keyHashSlot(key,sdslen(key))); - } else if (!strcasecmp(c->argv[1]->ptr,"countkeysinslot") && c->argc == 3) { - /* CLUSTER COUNTKEYSINSLOT */ - long long slot; - - if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK) - return; - if (slot < 0 || slot >= CLUSTER_SLOTS) { - addReplyError(c,"Invalid slot"); - return; - } - addReplyLongLong(c,countKeysInSlot(slot)); - } else if (!strcasecmp(c->argv[1]->ptr,"getkeysinslot") && c->argc == 4) { - /* CLUSTER GETKEYSINSLOT */ - long long maxkeys, slot; - - if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK) - return; - if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL) - != C_OK) - return; - if (slot < 0 || slot >= CLUSTER_SLOTS || maxkeys < 0) { - addReplyError(c,"Invalid slot or number of keys"); - return; - } - - unsigned int keys_in_slot = countKeysInSlot(slot); - unsigned int numkeys = maxkeys > keys_in_slot ? keys_in_slot : maxkeys; - addReplyArrayLen(c,numkeys); - dictEntry *de = (*server.db->slots_to_keys).by_slot[slot].head; - for (unsigned int j = 0; j < numkeys; j++) { - serverAssert(de != NULL); - sds sdskey = dictGetKey(de); - addReplyBulkCBuffer(c, sdskey, sdslen(sdskey)); - de = dictEntryNextInSlot(de); - } - } else if (!strcasecmp(c->argv[1]->ptr,"forget") && c->argc == 3) { - /* CLUSTER FORGET */ - clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); - if (!n) { - if (clusterBlacklistExists((char*)c->argv[2]->ptr)) - /* Already forgotten. 
The deletion may have been gossipped by - * another node, so we pretend it succeeded. */ - addReply(c,shared.ok); - else - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); - return; - } else if (n == myself) { - addReplyError(c,"I tried hard but I can't forget myself..."); - return; - } else if (nodeIsSlave(myself) && myself->slaveof == n) { - addReplyError(c,"Can't forget my master!"); - return; - } - clusterBlacklistAddNode(n); - clusterDelNode(n); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc == 3) { - /* CLUSTER REPLICATE */ - /* Lookup the specified node in our table. */ - clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); - return; - } - - /* I can't replicate myself. */ - if (n == myself) { - addReplyError(c,"Can't replicate myself"); - return; - } +/* + * Copyright (c) 2009-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). + * + * Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information. + */ - /* Can't replicate a slave. */ - if (nodeIsSlave(n)) { - addReplyError(c,"I can only replicate a master, not a replica."); - return; - } +/* + * cluster.c contains the common parts of a clustering + * implementation, the parts that are shared between + * any implementation of Redis clustering. + */ - /* If the instance is currently a master, it should have no assigned - * slots nor keys to accept to replicate some other node. - * Slaves can switch to another master without issues. 
*/ - if (nodeIsMaster(myself) && - (myself->numslots != 0 || dictSize(server.db[0].dict) != 0)) { - addReplyError(c, - "To set a master the node must be empty and " - "without assigned slots."); - return; - } +#include "server.h" +#include "cluster.h" - /* Set the master. */ - clusterSetMaster(n); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } else if ((!strcasecmp(c->argv[1]->ptr,"slaves") || - !strcasecmp(c->argv[1]->ptr,"replicas")) && c->argc == 3) { - /* CLUSTER SLAVES */ - clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); - int j; +#include - /* Lookup the specified node in our table. */ - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); - return; - } +/* ----------------------------------------------------------------------------- + * Key space handling + * -------------------------------------------------------------------------- */ - if (nodeIsSlave(n)) { - addReplyError(c,"The specified node is not a master"); - return; - } +/* We have 16384 hash slots. The hash slot of a given key is obtained + * as the least significant 14 bits of the crc16 of the key. + * + * However, if the key contains the {...} pattern, only the part between + * { and } is hashed. This may be useful in the future to force certain + * keys to be in the same node (assuming no resharding is in progress). */ +unsigned int keyHashSlot(char *key, int keylen) { + int s, e; /* start-end indexes of { and } */ - /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. 
*/ - addReplyArrayLen(c,n->numslaves); - for (j = 0; j < n->numslaves; j++) { - sds ni = clusterGenNodeDescription(c, n->slaves[j], connIsTLS(c->conn)); - addReplyBulkCString(c,ni); - sdsfree(ni); - } - } else if (!strcasecmp(c->argv[1]->ptr,"count-failure-reports") && - c->argc == 3) - { - /* CLUSTER COUNT-FAILURE-REPORTS */ - clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + for (s = 0; s < keylen; s++) + if (key[s] == '{') break; - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); - return; - } else { - addReplyLongLong(c,clusterNodeFailureReportsCount(n)); - } - } else if (!strcasecmp(c->argv[1]->ptr,"failover") && - (c->argc == 2 || c->argc == 3)) - { - /* CLUSTER FAILOVER [FORCE|TAKEOVER] */ - int force = 0, takeover = 0; - - if (c->argc == 3) { - if (!strcasecmp(c->argv[2]->ptr,"force")) { - force = 1; - } else if (!strcasecmp(c->argv[2]->ptr,"takeover")) { - takeover = 1; - force = 1; /* Takeover also implies force. */ - } else { - addReplyErrorObject(c,shared.syntaxerr); - return; - } - } + /* No '{' ? Hash the whole key. This is the base case. */ + if (s == keylen) return crc16(key,keylen) & 0x3FFF; - /* Check preconditions. */ - if (nodeIsMaster(myself)) { - addReplyError(c,"You should send CLUSTER FAILOVER to a replica"); - return; - } else if (myself->slaveof == NULL) { - addReplyError(c,"I'm a replica but my master is unknown to me"); - return; - } else if (!force && - (nodeFailed(myself->slaveof) || - myself->slaveof->link == NULL)) - { - addReplyError(c,"Master is down or failed, " - "please use CLUSTER FAILOVER FORCE"); - return; - } - resetManualFailover(); - server.cluster->mf_end = mstime() + CLUSTER_MF_TIMEOUT; - - if (takeover) { - /* A takeover does not perform any initial check. It just - * generates a new configuration epoch for this node without - * consensus, claims the master's slots, and broadcast the new - * configuration. 
*/ - serverLog(LL_NOTICE,"Taking over the master (user request)."); - clusterBumpConfigEpochWithoutConsensus(); - clusterFailoverReplaceYourMaster(); - } else if (force) { - /* If this is a forced failover, we don't need to talk with our - * master to agree about the offset. We just failover taking over - * it without coordination. */ - serverLog(LL_NOTICE,"Forced failover user request accepted."); - server.cluster->mf_can_start = 1; - } else { - serverLog(LL_NOTICE,"Manual failover user request accepted."); - clusterSendMFStart(myself->slaveof); - } - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"set-config-epoch") && c->argc == 3) - { - /* CLUSTER SET-CONFIG-EPOCH - * - * The user is allowed to set the config epoch only when a node is - * totally fresh: no config epoch, no other known node, and so forth. - * This happens at cluster creation time to start with a cluster where - * every node has a different node ID, without to rely on the conflicts - * resolution system which is too slow when a big cluster is created. */ - long long epoch; - - if (getLongLongFromObjectOrReply(c,c->argv[2],&epoch,NULL) != C_OK) - return; + /* '{' found? Check if we have the corresponding '}'. 
*/ + for (e = s+1; e < keylen; e++) + if (key[e] == '}') break; - if (epoch < 0) { - addReplyErrorFormat(c,"Invalid config epoch specified: %lld",epoch); - } else if (dictSize(server.cluster->nodes) > 1) { - addReplyError(c,"The user can assign a config epoch only when the " - "node does not know any other node."); - } else if (myself->configEpoch != 0) { - addReplyError(c,"Node config epoch is already non-zero"); - } else { - myself->configEpoch = epoch; - serverLog(LL_NOTICE, - "configEpoch set to %llu via CLUSTER SET-CONFIG-EPOCH", - (unsigned long long) myself->configEpoch); - - if (server.cluster->currentEpoch < (uint64_t)epoch) - server.cluster->currentEpoch = epoch; - /* No need to fsync the config here since in the unlucky event - * of a failure to persist the config, the conflict resolution code - * will assign a unique config to this node. */ - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| - CLUSTER_TODO_SAVE_CONFIG); - addReply(c,shared.ok); - } - } else if (!strcasecmp(c->argv[1]->ptr,"reset") && - (c->argc == 2 || c->argc == 3)) - { - /* CLUSTER RESET [SOFT|HARD] */ - int hard = 0; - - /* Parse soft/hard argument. Default is soft. */ - if (c->argc == 3) { - if (!strcasecmp(c->argv[2]->ptr,"hard")) { - hard = 1; - } else if (!strcasecmp(c->argv[2]->ptr,"soft")) { - hard = 0; - } else { - addReplyErrorObject(c,shared.syntaxerr); - return; - } - } + /* No '}' or nothing between {} ? Hash the whole key. */ + if (e == keylen || e == s+1) return crc16(key,keylen) & 0x3FFF; - /* Slaves can be reset while containing data, but not master nodes - * that must be empty. 
*/ - if (nodeIsMaster(myself) && dictSize(c->db->dict) != 0) { - addReplyError(c,"CLUSTER RESET can't be called with " - "master nodes containing keys"); - return; - } - clusterReset(hard); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"links") && c->argc == 2) { - /* CLUSTER LINKS */ - addReplyClusterLinksDescription(c); - } else { - addReplySubcommandSyntaxError(c); - return; - } + /* If we are here there is both a { and a } on its right. Hash + * what is in the middle between { and }. */ + return crc16(key+s+1,e-s-1) & 0x3FFF; } -void removeChannelsInSlot(unsigned int slot) { - unsigned int channelcount = countChannelsInSlot(slot); - if (channelcount == 0) return; - - /* Retrieve all the channels for the slot. */ - robj **channels = zmalloc(sizeof(robj*)*channelcount); - raxIterator iter; - int j = 0; - unsigned char indexed[2]; - - indexed[0] = (slot >> 8) & 0xff; - indexed[1] = slot & 0xff; - raxStart(&iter,server.cluster->slots_to_channels); - raxSeek(&iter,">=",indexed,2); - while(raxNext(&iter)) { - if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; - channels[j++] = createStringObject((char*)iter.key + 2, iter.key_len - 2); +/* If it can be inferred that the given glob-style pattern, as implemented in + * stringmatchlen() in util.c, only can match keys belonging to a single slot, + * that slot is returned. Otherwise -1 is returned. */ +int patternHashSlot(char *pattern, int length) { + int s = -1; /* index of the first '{' */ + + for (int i = 0; i < length; i++) { + if (pattern[i] == '*' || pattern[i] == '?' || pattern[i] == '[') { + /* Wildcard or character class found. Keys can be in any slot. */ + return -1; + } else if (pattern[i] == '\\') { + /* Escaped character. Computing slot in this case is not + * implemented. We would need a temp buffer. */ + return -1; + } else if (s == -1 && pattern[i] == '{') { + /* Opening brace '{' found. 
*/ + s = i; + } else if (s >= 0 && pattern[i] == '}' && i == s + 1) { + /* Empty tag '{}' found. The whole key is hashed. Ignore braces. */ + s = -2; + } else if (s >= 0 && pattern[i] == '}') { + /* Non-empty tag '{...}' found. Hash what's between braces. */ + return crc16(pattern + s + 1, i - s - 1) & 0x3FFF; + } + } + + /* The pattern matches a single key. Hash the whole pattern. */ + return crc16(pattern, length) & 0x3FFF; +} + +ConnectionType *connTypeOfCluster(void) { + if (server.tls_cluster) { + return connectionTypeTls(); } - raxStop(&iter); - pubsubUnsubscribeShardChannels(channels, channelcount); - zfree(channels); + return connectionTypeTcp(); } /* ----------------------------------------------------------------------------- @@ -6580,7 +192,7 @@ void restoreCommand(client *c) { lfu_freq == -1) { if (getLongLongFromObjectOrReply(c,c->argv[j+1],&lru_idle,NULL) - != C_OK) return; + != C_OK) return; if (lru_idle < 0) { addReplyError(c,"Invalid IDLETIME value, must be >= 0"); return; @@ -6591,7 +203,7 @@ void restoreCommand(client *c) { lru_idle == -1) { if (getLongLongFromObjectOrReply(c,c->argv[j+1],&lfu_freq,NULL) - != C_OK) return; + != C_OK) return; if (lfu_freq < 0 || lfu_freq > 255) { addReplyError(c,"Invalid FREQ value, must be >= 0 and <= 255"); return; @@ -6653,7 +265,16 @@ void restoreCommand(client *c) { } /* Create the key and set the TTL if any */ - dbAdd(c->db,key,obj); + dictEntry *de = dbAdd(c->db,key,obj); + + /* If minExpiredField was set, then the object is hash with expiration + * on fields and need to register it in global HFE DS */ + if (obj->type == OBJ_HASH) { + uint64_t minExpiredField = hashTypeGetMinExpire(obj, 1); + if (minExpiredField != EB_EXPIRE_TIME_INVALID) + hashTypeAddToExpires(c->db, dictGetKey(de), obj, minExpiredField); + } + if (ttl) { setExpire(c,c->db,key,ttl); if (!absttl) { @@ -6670,7 +291,6 @@ void restoreCommand(client *c) { addReply(c,shared.ok); server.dirty++; } - /* MIGRATE socket cache implementation. 
* * We take a map between host:ip and a TCP socket that we used to connect @@ -6726,7 +346,7 @@ migrateCachedSocket* migrateGetSocket(client *c, robj *host, robj *port, long ti /* Create the connection */ conn = connCreate(connTypeOfCluster()); if (connBlockingConnect(conn, host->ptr, atoi(port->ptr), timeout) - != C_OK) { + != C_OK) { addReplyError(c,"-IOERR error or timeout connecting to the client"); connClose(conn); sdsfree(name); @@ -6833,8 +453,8 @@ void migrateCommand(client *c) { } else if (!strcasecmp(c->argv[j]->ptr,"keys")) { if (sdslen(c->argv[3]->ptr) != 0) { addReplyError(c, - "When using MIGRATE KEYS option, the key argument" - " must be set to the empty string"); + "When using MIGRATE KEYS option, the key argument" + " must be set to the empty string"); return; } first_key = j+1; @@ -6876,7 +496,7 @@ void migrateCommand(client *c) { return; } -try_again: + try_again: write_error = 0; /* Connect */ @@ -6895,10 +515,10 @@ void migrateCommand(client *c) { serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"AUTH",4)); if (username) { serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,username, - sdslen(username))); + sdslen(username))); } serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,password, - sdslen(password))); + sdslen(password))); } /* Send the SELECT command if the current DB is not already selected. */ @@ -6934,24 +554,24 @@ void migrateCommand(client *c) { kv[non_expired++] = kv[j]; serverAssertWithInfo(c,NULL, - rioWriteBulkCount(&cmd,'*',replace ? 5 : 4)); + rioWriteBulkCount(&cmd,'*',replace ? 
5 : 4)); if (server.cluster_enabled) serverAssertWithInfo(c,NULL, - rioWriteBulkString(&cmd,"RESTORE-ASKING",14)); + rioWriteBulkString(&cmd,"RESTORE-ASKING",14)); else serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"RESTORE",7)); serverAssertWithInfo(c,NULL,sdsEncodedObject(kv[j])); serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,kv[j]->ptr, - sdslen(kv[j]->ptr))); + sdslen(kv[j]->ptr))); serverAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,ttl)); /* Emit the payload argument, that is the serialized object using * the DUMP format. */ createDumpPayload(&payload,ov[j],kv[j],dbid); serverAssertWithInfo(c,NULL, - rioWriteBulkString(&cmd,payload.io.buffer.ptr, - sdslen(payload.io.buffer.ptr))); + rioWriteBulkString(&cmd,payload.io.buffer.ptr, + sdslen(payload.io.buffer.ptr))); sdsfree(payload.io.buffer.ptr); /* Add the REPLACE option to the RESTORE command if it was specified @@ -7023,7 +643,7 @@ void migrateCommand(client *c) { error_from_target = 1; addReplyErrorFormat(c,"Target instance replied with error: %s", - errbuf+1); + errbuf+1); } } else { if (!copy) { @@ -7099,7 +719,7 @@ void migrateCommand(client *c) { /* On socket errors we try to close the cached socket and try again. * It is very common for the cached socket to get closed, if just reopening * it works it's a shame to notify the error to the caller. */ -socket_err: + socket_err: /* Cleanup we want to perform in both the retry and no retry case. * Note: Closing the migrate socket will also force SELECT next time. */ sdsfree(cmd.io.buffer.ptr); @@ -7112,58 +732,220 @@ void migrateCommand(client *c) { zfree(newargv); newargv = NULL; /* This will get reallocated on retry. */ - /* Retry only if it's not a timeout and we never attempted a retry - * (or the code jumping here did not set may_retry to zero). 
*/ - if (errno != ETIMEDOUT && may_retry) { - may_retry = 0; - goto try_again; + /* Retry only if it's not a timeout and we never attempted a retry + * (or the code jumping here did not set may_retry to zero). */ + if (errno != ETIMEDOUT && may_retry) { + may_retry = 0; + goto try_again; + } + + /* Cleanup we want to do if no retry is attempted. */ + zfree(ov); zfree(kv); + addReplyErrorSds(c, sdscatprintf(sdsempty(), + "-IOERR error or timeout %s to target instance", + write_error ? "writing" : "reading")); + return; +} + +/* Cluster node sanity check. Returns C_OK if the node id + * is valid an C_ERR otherwise. */ +int verifyClusterNodeId(const char *name, int length) { + if (length != CLUSTER_NAMELEN) return C_ERR; + for (int i = 0; i < length; i++) { + if (name[i] >= 'a' && name[i] <= 'z') continue; + if (name[i] >= '0' && name[i] <= '9') continue; + return C_ERR; + } + return C_OK; +} + +int isValidAuxChar(int c) { + return isalnum(c) || (strchr("!#$%&()*+:;<>?@[]^{|}~", c) == NULL); +} + +int isValidAuxString(char *s, unsigned int length) { + for (unsigned i = 0; i < length; i++) { + if (!isValidAuxChar(s[i])) return 0; } + return 1; +} - /* Cleanup we want to do if no retry is attempted. */ - zfree(ov); zfree(kv); - addReplyErrorSds(c, sdscatprintf(sdsempty(), - "-IOERR error or timeout %s to target instance", - write_error ? "writing" : "reading")); - return; +void clusterCommandMyId(client *c) { + char *name = clusterNodeGetName(getMyClusterNode()); + if (name) { + addReplyBulkCBuffer(c,name, CLUSTER_NAMELEN); + } else { + addReplyError(c, "No ID yet"); + } } -/* ----------------------------------------------------------------------------- - * Cluster functions related to serving / redirecting clients - * -------------------------------------------------------------------------- */ +char* getMyClusterId(void) { + return clusterNodeGetName(getMyClusterNode()); +} -/* The ASKING command is required after a -ASK redirection. 
- * The client should issue ASKING before to actually send the command to - * the target instance. See the Redis Cluster specification for more - * information. */ -void askingCommand(client *c) { - if (server.cluster_enabled == 0) { - addReplyError(c,"This instance has cluster support disabled"); - return; +void clusterCommandMyShardId(client *c) { + char *sid = clusterNodeGetShardId(getMyClusterNode()); + if (sid) { + addReplyBulkCBuffer(c,sid, CLUSTER_NAMELEN); + } else { + addReplyError(c, "No shard ID yet"); } - c->flags |= CLIENT_ASKING; - addReply(c,shared.ok); } -/* The READONLY command is used by clients to enter the read-only mode. - * In this mode slaves will not redirect clients as long as clients access - * with read-only commands to keys that are served by the slave's master. */ -void readonlyCommand(client *c) { +/* When a cluster command is called, we need to decide whether to return TLS info or + * non-TLS info by the client's connection type. However if the command is called by + * a Lua script or RM_call, there is no connection in the fake client, so we use + * server.current_client here to get the real client if available. And if it is not + * available (modules may call commands without a real client), we return the default + * info, which is determined by server.tls_cluster. 
*/ +static int shouldReturnTlsInfo(void) { + if (server.current_client && server.current_client->conn) { + return connIsTLS(server.current_client->conn); + } else { + return server.tls_cluster; + } +} + +unsigned int countKeysInSlot(unsigned int slot) { + return kvstoreDictSize(server.db->keys, slot); +} + +void clusterCommandHelp(client *c) { + const char *help[] = { + "COUNTKEYSINSLOT ", + " Return the number of keys in .", + "GETKEYSINSLOT ", + " Return key names stored by current node in a slot.", + "INFO", + " Return information about the cluster.", + "KEYSLOT ", + " Return the hash slot for .", + "MYID", + " Return the node id.", + "MYSHARDID", + " Return the node's shard id.", + "NODES", + " Return cluster configuration seen by node. Output format:", + " ...", + "REPLICAS ", + " Return replicas.", + "SLOTS", + " Return information about slots range mappings. Each range is made of:", + " start, end, master and replicas IP addresses, ports and ids", + "SHARDS", + " Return information about slot range mappings and the nodes associated with them.", + NULL + }; + + addExtendedReplyHelp(c, help, clusterCommandExtendedHelp()); +} + +void clusterCommand(client *c) { if (server.cluster_enabled == 0) { addReplyError(c,"This instance has cluster support disabled"); return; } - c->flags |= CLIENT_READONLY; - addReply(c,shared.ok); -} -/* The READWRITE command just clears the READONLY command state. */ -void readwriteCommand(client *c) { - if (server.cluster_enabled == 0) { - addReplyError(c,"This instance has cluster support disabled"); + if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"help")) { + clusterCommandHelp(c); + } else if (!strcasecmp(c->argv[1]->ptr,"nodes") && c->argc == 2) { + /* CLUSTER NODES */ + /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. 
*/ + sds nodes = clusterGenNodesDescription(c, 0, shouldReturnTlsInfo()); + addReplyVerbatim(c,nodes,sdslen(nodes),"txt"); + sdsfree(nodes); + } else if (!strcasecmp(c->argv[1]->ptr,"myid") && c->argc == 2) { + /* CLUSTER MYID */ + clusterCommandMyId(c); + } else if (!strcasecmp(c->argv[1]->ptr,"myshardid") && c->argc == 2) { + /* CLUSTER MYSHARDID */ + clusterCommandMyShardId(c); + } else if (!strcasecmp(c->argv[1]->ptr,"slots") && c->argc == 2) { + /* CLUSTER SLOTS */ + clusterCommandSlots(c); + } else if (!strcasecmp(c->argv[1]->ptr,"shards") && c->argc == 2) { + /* CLUSTER SHARDS */ + clusterCommandShards(c); + } else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) { + /* CLUSTER INFO */ + + sds info = genClusterInfoString(); + + /* Produce the reply protocol. */ + addReplyVerbatim(c,info,sdslen(info),"txt"); + sdsfree(info); + } else if (!strcasecmp(c->argv[1]->ptr,"keyslot") && c->argc == 3) { + /* CLUSTER KEYSLOT */ + sds key = c->argv[2]->ptr; + + addReplyLongLong(c,keyHashSlot(key,sdslen(key))); + } else if (!strcasecmp(c->argv[1]->ptr,"countkeysinslot") && c->argc == 3) { + /* CLUSTER COUNTKEYSINSLOT */ + long long slot; + + if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK) + return; + if (slot < 0 || slot >= CLUSTER_SLOTS) { + addReplyError(c,"Invalid slot"); + return; + } + addReplyLongLong(c,countKeysInSlot(slot)); + } else if (!strcasecmp(c->argv[1]->ptr,"getkeysinslot") && c->argc == 4) { + /* CLUSTER GETKEYSINSLOT */ + long long maxkeys, slot; + + if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK) + return; + if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL) + != C_OK) + return; + if (slot < 0 || slot >= CLUSTER_SLOTS || maxkeys < 0) { + addReplyError(c,"Invalid slot or number of keys"); + return; + } + + unsigned int keys_in_slot = countKeysInSlot(slot); + unsigned int numkeys = maxkeys > keys_in_slot ? 
keys_in_slot : maxkeys; + addReplyArrayLen(c,numkeys); + kvstoreDictIterator *kvs_di = NULL; + dictEntry *de = NULL; + kvs_di = kvstoreGetDictIterator(server.db->keys, slot); + for (unsigned int i = 0; i < numkeys; i++) { + de = kvstoreDictIteratorNext(kvs_di); + serverAssert(de != NULL); + sds sdskey = dictGetKey(de); + addReplyBulkCBuffer(c, sdskey, sdslen(sdskey)); + } + kvstoreReleaseDictIterator(kvs_di); + } else if ((!strcasecmp(c->argv[1]->ptr,"slaves") || + !strcasecmp(c->argv[1]->ptr,"replicas")) && c->argc == 3) { + /* CLUSTER SLAVES */ + /* CLUSTER REPLICAS */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + int j; + + /* Lookup the specified node in our table. */ + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return; + } + + if (clusterNodeIsSlave(n)) { + addReplyError(c,"The specified node is not a master"); + return; + } + + /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ + addReplyArrayLen(c, clusterNodeNumSlaves(n)); + for (j = 0; j < clusterNodeNumSlaves(n); j++) { + sds ni = clusterGenNodeDescription(c, clusterNodeGetSlave(n, j), shouldReturnTlsInfo()); + addReplyBulkCString(c,ni); + sdsfree(ni); + } + } else if(!clusterCommandSpecial(c)) { + addReplySubcommandSyntaxError(c); return; } - c->flags &= ~CLIENT_READONLY; - addReply(c,shared.ok); } /* Return the pointer to the cluster node that is able to serve the command. @@ -7199,13 +981,15 @@ void readwriteCommand(client *c) { * CLUSTER_REDIR_DOWN_STATE and CLUSTER_REDIR_DOWN_RO_STATE if the cluster is * down but the user attempts to execute a command that addresses one or more keys. 
*/ clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) { + clusterNode *myself = getMyClusterNode(); clusterNode *n = NULL; robj *firstkey = NULL; int multiple_keys = 0; multiState *ms, _ms; multiCmd mc; int i, slot = 0, migrating_slot = 0, importing_slot = 0, missing_keys = 0, - existing_keys = 0; + existing_keys = 0; + int pubsubshard_included = 0; /* Flag to indicate if a pubsub shard cmd is included. */ /* Allow any key to be set if a module disabled cluster redirections. */ if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) @@ -7237,10 +1021,6 @@ clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, in mc.cmd = cmd; } - int is_pubsubshard = cmd->proc == ssubscribeCommand || - cmd->proc == sunsubscribeCommand || - cmd->proc == spublishCommand; - /* Check that all the keys are in the same hash slot, and obtain this * slot and the node associated. */ for (i = 0; i < ms->count; i++) { @@ -7253,6 +1033,13 @@ clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, in margc = ms->commands[i].argc; margv = ms->commands[i].argv; + /* Only valid for sharded pubsub as regular pubsub can operate on any node and bypasses this layer. */ + if (!pubsubshard_included && + doesCommandHaveChannelsWithFlags(mcmd, CMD_CHANNEL_PUBLISH | CMD_CHANNEL_SUBSCRIBE)) + { + pubsubshard_included = 1; + } + getKeysResult result = GETKEYS_RESULT_INIT; numkeys = getKeysFromCommand(mcmd,margv,margc,&result); keyindex = result.keys; @@ -7267,7 +1054,7 @@ clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, in * and node. */ firstkey = thiskey; slot = thisslot; - n = server.cluster->slots[slot]; + n = getNodeBySlot(slot); /* Error: If a slot is not served, we are in "cluster down" * state. 
However the state is yet to be updated, so this was @@ -7286,10 +1073,10 @@ clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, in * error). To do so we set the importing/migrating state and * increment a counter for every missing key. */ if (n == myself && - server.cluster->migrating_slots_to[slot] != NULL) + getMigratingSlotDest(slot) != NULL) { migrating_slot = 1; - } else if (server.cluster->importing_slots_from[slot] != NULL) { + } else if (getImportingSlotSource(slot) != NULL) { importing_slot = 1; } } else { @@ -7300,7 +1087,7 @@ clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, in getKeysFreeResult(&result); if (error_code) *error_code = CLUSTER_REDIR_CROSS_SLOT; - return NULL; + return NULL; } if (importing_slot && !multiple_keys && !equalStringObjects(firstkey,thiskey)) { /* Flag this request as one with multiple different @@ -7316,7 +1103,7 @@ clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, in * node until the migration completes with CLUSTER SETSLOT * NODE . */ int flags = LOOKUP_NOTOUCH | LOOKUP_NOSTATS | LOOKUP_NONOTIFY | LOOKUP_NOEXPIRE; - if ((migrating_slot || importing_slot) && !is_pubsubshard) + if ((migrating_slot || importing_slot) && !pubsubshard_included) { if (lookupKeyReadWithFlags(&server.db[0], thiskey, flags) == NULL) missing_keys++; else existing_keys++; @@ -7332,8 +1119,8 @@ clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, in uint64_t cmd_flags = getCommandFlags(c); /* Cluster is globally down but we got keys? We only serve the request * if it is a read command and when allow_reads_when_down is enabled. 
*/ - if (server.cluster->state != CLUSTER_OK) { - if (is_pubsubshard) { + if (!isClusterHealthy()) { + if (pubsubshard_included) { if (!server.cluster_allow_pubsubshard_when_down) { if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE; return NULL; @@ -7372,7 +1159,7 @@ clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, in return NULL; } else { if (error_code) *error_code = CLUSTER_REDIR_ASK; - return server.cluster->migrating_slots_to[slot]; + return getMigratingSlotDest(slot); } } @@ -7396,15 +1183,15 @@ clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, in * is serving, we can reply without redirection. */ int is_write_command = (cmd_flags & CMD_WRITE) || (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_WRITE)); - if (((c->flags & CLIENT_READONLY) || is_pubsubshard) && + if (((c->flags & CLIENT_READONLY) || pubsubshard_included) && !is_write_command && - nodeIsSlave(myself) && - myself->slaveof == n) + clusterNodeIsSlave(myself) && + clusterNodeGetSlaveof(myself) == n) { return myself; } - /* Base case: just return the right node. However if this node is not + /* Base case: just return the right node. However, if this node is not * myself, set error_code to MOVED since we need to issue a redirection. */ if (n != myself && error_code) *error_code = CLUSTER_REDIR_MOVED; return n; @@ -7435,11 +1222,11 @@ void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_co error_code == CLUSTER_REDIR_ASK) { /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */ - int port = getNodeClientPort(n, connIsTLS(c->conn)); + int port = clusterNodeClientPort(n, shouldReturnTlsInfo()); addReplyErrorSds(c,sdscatprintf(sdsempty(), - "-%s %d %s:%d", - (error_code == CLUSTER_REDIR_ASK) ? "ASK" : "MOVED", - hashslot, getPreferredEndpoint(n), port)); + "-%s %d %s:%d", + (error_code == CLUSTER_REDIR_ASK) ? 
"ASK" : "MOVED", + hashslot, clusterNodePreferredEndpoint(n), port)); } else { serverPanic("getNodeByQuery() unknown error."); } @@ -7457,6 +1244,7 @@ void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_co * longer handles, the client is sent a redirection error, and the function * returns 1. Otherwise 0 is returned and no operation is performed. */ int clusterRedirectBlockedClientIfNeeded(client *c) { + clusterNode *myself = getMyClusterNode(); if (c->flags & CLIENT_BLOCKED && (c->bstate.btype == BLOCKED_LIST || c->bstate.btype == BLOCKED_ZSET || @@ -7470,7 +1258,7 @@ int clusterRedirectBlockedClientIfNeeded(client *c) { * If the cluster is configured to allow reads on cluster down, we * still want to emit this error since a write will be required * to unblock them which may never come. */ - if (server.cluster->state == CLUSTER_FAIL) { + if (!isClusterHealthy()) { clusterRedirectClient(c,NULL,0,CLUSTER_REDIR_DOWN_STATE); return 1; } @@ -7485,13 +1273,13 @@ int clusterRedirectBlockedClientIfNeeded(client *c) { if ((de = dictNext(di)) != NULL) { robj *key = dictGetKey(de); int slot = keyHashSlot((char*)key->ptr, sdslen(key->ptr)); - clusterNode *node = server.cluster->slots[slot]; + clusterNode *node = getNodeBySlot(slot); /* if the client is read-only and attempting to access key that our * replica can handle, allow it. */ if ((c->flags & CLIENT_READONLY) && !(c->lastcmd->flags & CMD_WRITE) && - nodeIsSlave(myself) && myself->slaveof == node) + clusterNodeIsSlave(myself) && clusterNodeGetSlaveof(myself) == node) { node = myself; } @@ -7499,15 +1287,14 @@ int clusterRedirectBlockedClientIfNeeded(client *c) { /* We send an error and unblock the client if: * 1) The slot is unassigned, emitting a cluster down error. * 2) The slot is not handled by this node, nor being imported. 
*/ - if (node != myself && - server.cluster->importing_slots_from[slot] == NULL) + if (node != myself && getImportingSlotSource(slot) == NULL) { if (node == NULL) { clusterRedirectClient(c,NULL,0, - CLUSTER_REDIR_DOWN_UNBOUND); + CLUSTER_REDIR_DOWN_UNBOUND); } else { clusterRedirectClient(c,node,slot, - CLUSTER_REDIR_MOVED); + CLUSTER_REDIR_MOVED); } dictReleaseIterator(di); return 1; @@ -7518,160 +1305,169 @@ int clusterRedirectBlockedClientIfNeeded(client *c) { return 0; } -/* Slot to Key API. This is used by Redis Cluster in order to obtain in - * a fast way a key that belongs to a specified hash slot. This is useful - * while rehashing the cluster and in other conditions when we need to - * understand if we have keys for a given hash slot. */ - -void slotToKeyAddEntry(dictEntry *entry, redisDb *db) { - sds key = dictGetKey(entry); - unsigned int hashslot = keyHashSlot(key, sdslen(key)); - slotToKeys *slot_to_keys = &(*db->slots_to_keys).by_slot[hashslot]; - slot_to_keys->count++; - - /* Insert entry before the first element in the list. */ - dictEntry *first = slot_to_keys->head; - dictEntryNextInSlot(entry) = first; - if (first != NULL) { - serverAssert(dictEntryPrevInSlot(first) == NULL); - dictEntryPrevInSlot(first) = entry; +/* Returns an indication if the replica node is fully available + * and should be listed in CLUSTER SLOTS response. + * Returns 1 for available nodes, 0 for nodes that have + * not finished their initial sync, in failed state, or are + * otherwise considered not available to serve read commands. */ +static int isReplicaAvailable(clusterNode *node) { + if (clusterNodeIsFailing(node)) { + return 0; + } + long long repl_offset = clusterNodeReplOffset(node); + if (clusterNodeIsMyself(node)) { + /* Nodes do not update their own information + * in the cluster node list. 
*/ + repl_offset = replicationGetSlaveOffset(); } - serverAssert(dictEntryPrevInSlot(entry) == NULL); - slot_to_keys->head = entry; + return (repl_offset != 0); } -void slotToKeyDelEntry(dictEntry *entry, redisDb *db) { - sds key = dictGetKey(entry); - unsigned int hashslot = keyHashSlot(key, sdslen(key)); - slotToKeys *slot_to_keys = &(*db->slots_to_keys).by_slot[hashslot]; - slot_to_keys->count--; - - /* Connect previous and next entries to each other. */ - dictEntry *next = dictEntryNextInSlot(entry); - dictEntry *prev = dictEntryPrevInSlot(entry); - if (next != NULL) { - dictEntryPrevInSlot(next) = prev; - } - if (prev != NULL) { - dictEntryNextInSlot(prev) = next; +void addNodeToNodeReply(client *c, clusterNode *node) { + char* hostname = clusterNodeHostname(node); + addReplyArrayLen(c, 4); + if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_IP) { + addReplyBulkCString(c, clusterNodeIp(node)); + } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_HOSTNAME) { + if (hostname != NULL && hostname[0] != '\0') { + addReplyBulkCString(c, hostname); + } else { + addReplyBulkCString(c, "?"); + } + } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT) { + addReplyNull(c); } else { - /* The removed entry was the first in the list. */ - serverAssert(slot_to_keys->head == entry); - slot_to_keys->head = next; + serverPanic("Unrecognized preferred endpoint type"); } -} -/* Updates neighbour entries when an entry has been replaced (e.g. reallocated - * during active defrag). */ -void slotToKeyReplaceEntry(dict *d, dictEntry *entry) { - dictEntry *next = dictEntryNextInSlot(entry); - dictEntry *prev = dictEntryPrevInSlot(entry); - if (next != NULL) { - dictEntryPrevInSlot(next) = entry; + /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. 
*/ + addReplyLongLong(c, clusterNodeClientPort(node, shouldReturnTlsInfo())); + addReplyBulkCBuffer(c, clusterNodeGetName(node), CLUSTER_NAMELEN); + + /* Add the additional endpoint information, this is all the known networking information + * that is not the preferred endpoint. Note the logic is evaluated twice so we can + * correctly report the number of additional network arguments without using a deferred + * map, an assertion is made at the end to check we set the right length. */ + int length = 0; + if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) { + length++; } - if (prev != NULL) { - dictEntryNextInSlot(prev) = entry; - } else { - /* The replaced entry was the first in the list. */ - sds key = dictGetKey(entry); - unsigned int hashslot = keyHashSlot(key, sdslen(key)); - clusterDictMetadata *dictmeta = dictMetadata(d); - redisDb *db = dictmeta->db; - slotToKeys *slot_to_keys = &(*db->slots_to_keys).by_slot[hashslot]; - slot_to_keys->head = entry; + if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME + && hostname != NULL && hostname[0] != '\0') + { + length++; } -} + addReplyMapLen(c, length); -/* Initialize slots-keys map of given db. */ -void slotToKeyInit(redisDb *db) { - db->slots_to_keys = zcalloc(sizeof(clusterSlotToKeyMapping)); - clusterDictMetadata *dictmeta = dictMetadata(db->dict); - dictmeta->db = db; + if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) { + addReplyBulkCString(c, "ip"); + addReplyBulkCString(c, clusterNodeIp(node)); + length--; + } + if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME + && hostname != NULL && hostname[0] != '\0') + { + addReplyBulkCString(c, "hostname"); + addReplyBulkCString(c, hostname); + length--; + } + serverAssert(length == 0); } -/* Empty slots-keys map of given db. 
*/ -void slotToKeyFlush(redisDb *db) { - memset(db->slots_to_keys, 0, - sizeof(clusterSlotToKeyMapping)); -} +void addNodeReplyForClusterSlot(client *c, clusterNode *node, int start_slot, int end_slot) { + int i, nested_elements = 3; /* slots (2) + master addr (1) */ + for (i = 0; i < clusterNodeNumSlaves(node); i++) { + if (!isReplicaAvailable(clusterNodeGetSlave(node, i))) continue; + nested_elements++; + } + addReplyArrayLen(c, nested_elements); + addReplyLongLong(c, start_slot); + addReplyLongLong(c, end_slot); + addNodeToNodeReply(c, node); -/* Free slots-keys map of given db. */ -void slotToKeyDestroy(redisDb *db) { - zfree(db->slots_to_keys); - db->slots_to_keys = NULL; + /* Remaining nodes in reply are replicas for slot range */ + for (i = 0; i < clusterNodeNumSlaves(node); i++) { + /* This loop is copy/pasted from clusterGenNodeDescription() + * with modifications for per-slot node aggregation. */ + if (!isReplicaAvailable(clusterNodeGetSlave(node, i))) continue; + addNodeToNodeReply(c, clusterNodeGetSlave(node, i)); + nested_elements--; + } + serverAssert(nested_elements == 3); /* Original 3 elements */ } -/* Remove all the keys in the specified hash slot. - * The number of removed items is returned. 
*/ -unsigned int delKeysInSlot(unsigned int hashslot) { - unsigned int j = 0; - - dictEntry *de = (*server.db->slots_to_keys).by_slot[hashslot].head; - while (de != NULL) { - sds sdskey = dictGetKey(de); - de = dictEntryNextInSlot(de); - robj *key = createStringObject(sdskey, sdslen(sdskey)); - dbDelete(&server.db[0], key); - propagateDeletion(&server.db[0], key, server.lazyfree_lazy_server_del); - signalModifiedKey(NULL, &server.db[0], key); - moduleNotifyKeyspaceEvent(NOTIFY_GENERIC, "del", key, server.db[0].id); - postExecutionUnitOperations(); - decrRefCount(key); - j++; - server.dirty++; - } +void clusterCommandSlots(client * c) { + /* Format: 1) 1) start slot + * 2) end slot + * 3) 1) master IP + * 2) master port + * 3) node ID + * 4) 1) replica IP + * 2) replica port + * 3) node ID + * ... continued until done + */ + clusterNode *n = NULL; + int num_masters = 0, start = -1; + void *slot_replylen = addReplyDeferredLen(c); - return j; -} + for (int i = 0; i <= CLUSTER_SLOTS; i++) { + /* Find start node and slot id. */ + if (n == NULL) { + if (i == CLUSTER_SLOTS) break; + n = getNodeBySlot(i); + start = i; + continue; + } -unsigned int countKeysInSlot(unsigned int hashslot) { - return (*server.db->slots_to_keys).by_slot[hashslot].count; + /* Add cluster slots info when occur different node with start + * or end of slot. */ + if (i == CLUSTER_SLOTS || n != getNodeBySlot(i)) { + addNodeReplyForClusterSlot(c, n, start, i-1); + num_masters++; + if (i == CLUSTER_SLOTS) break; + n = getNodeBySlot(i); + start = i; + } + } + setDeferredArrayLen(c, slot_replylen, num_masters); } /* ----------------------------------------------------------------------------- - * Operation(s) on channel rax tree. 
+ * Cluster functions related to serving / redirecting clients * -------------------------------------------------------------------------- */ -void slotToChannelUpdate(sds channel, int add) { - size_t keylen = sdslen(channel); - unsigned int hashslot = keyHashSlot(channel,keylen); - unsigned char buf[64]; - unsigned char *indexed = buf; - - if (keylen+2 > 64) indexed = zmalloc(keylen+2); - indexed[0] = (hashslot >> 8) & 0xff; - indexed[1] = hashslot & 0xff; - memcpy(indexed+2,channel,keylen); - if (add) { - raxInsert(server.cluster->slots_to_channels,indexed,keylen+2,NULL,NULL); - } else { - raxRemove(server.cluster->slots_to_channels,indexed,keylen+2,NULL); +/* The ASKING command is required after a -ASK redirection. + * The client should issue ASKING before to actually send the command to + * the target instance. See the Redis Cluster specification for more + * information. */ +void askingCommand(client *c) { + if (server.cluster_enabled == 0) { + addReplyError(c,"This instance has cluster support disabled"); + return; } - if (indexed != buf) zfree(indexed); -} - -void slotToChannelAdd(sds channel) { - slotToChannelUpdate(channel,1); + c->flags |= CLIENT_ASKING; + addReply(c,shared.ok); } -void slotToChannelDel(sds channel) { - slotToChannelUpdate(channel,0); +/* The READONLY command is used by clients to enter the read-only mode. + * In this mode slaves will not redirect clients as long as clients access + * with read-only commands to keys that are served by the slave's master. */ +void readonlyCommand(client *c) { + if (server.cluster_enabled == 0) { + addReplyError(c,"This instance has cluster support disabled"); + return; + } + c->flags |= CLIENT_READONLY; + addReply(c,shared.ok); } -/* Get the count of the channels for a given slot. 
*/ -unsigned int countChannelsInSlot(unsigned int hashslot) { - raxIterator iter; - int j = 0; - unsigned char indexed[2]; - - indexed[0] = (hashslot >> 8) & 0xff; - indexed[1] = hashslot & 0xff; - raxStart(&iter,server.cluster->slots_to_channels); - raxSeek(&iter,">=",indexed,2); - while(raxNext(&iter)) { - if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break; - j++; +/* The READWRITE command just clears the READONLY command state. */ +void readwriteCommand(client *c) { + if (server.cluster_enabled == 0) { + addReplyError(c,"This instance has cluster support disabled"); + return; } - raxStop(&iter); - return j; + c->flags &= ~CLIENT_READONLY; + addReply(c,shared.ok); } diff --git a/src/cluster.h b/src/cluster.h index 21c9c4499db..f21f1e9c16e 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -2,22 +2,15 @@ #define __CLUSTER_H /*----------------------------------------------------------------------------- - * Redis cluster data structures, defines, exported API. + * Redis cluster exported API. *----------------------------------------------------------------------------*/ -#define CLUSTER_SLOTS 16384 +#define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. */ +#define CLUSTER_SLOTS (1<flags & CLUSTER_NODE_MASTER) -#define nodeIsSlave(n) ((n)->flags & CLUSTER_NODE_SLAVE) -#define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE) -#define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR)) -#define nodeWithoutAddr(n) ((n)->flags & CLUSTER_NODE_NOADDR) -#define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL) -#define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL) -#define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER) - -/* Reasons why a slave is not able to failover. 
*/ -#define CLUSTER_CANT_FAILOVER_NONE 0 -#define CLUSTER_CANT_FAILOVER_DATA_AGE 1 -#define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2 -#define CLUSTER_CANT_FAILOVER_EXPIRED 3 -#define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4 -#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD (10) /* seconds. */ - -/* clusterState todo_before_sleep flags. */ -#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0) -#define CLUSTER_TODO_UPDATE_STATE (1<<1) -#define CLUSTER_TODO_SAVE_CONFIG (1<<2) -#define CLUSTER_TODO_FSYNC_CONFIG (1<<3) -#define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1<<4) - -/* Message types. - * - * Note that the PING, PONG and MEET messages are actually the same exact - * kind of packet. PONG is the reply to ping, in the exact format as a PING, - * while MEET is a special PING that forces the receiver to add the sender - * as a node (if it is not already in the list). */ -#define CLUSTERMSG_TYPE_PING 0 /* Ping */ -#define CLUSTERMSG_TYPE_PONG 1 /* Pong (reply to Ping) */ -#define CLUSTERMSG_TYPE_MEET 2 /* Meet "let's join" message */ -#define CLUSTERMSG_TYPE_FAIL 3 /* Mark node xxx as failing */ -#define CLUSTERMSG_TYPE_PUBLISH 4 /* Pub/Sub Publish propagation */ -#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */ -#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you have my vote */ -#define CLUSTERMSG_TYPE_UPDATE 7 /* Another node slots configuration */ -#define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */ -#define CLUSTERMSG_TYPE_MODULE 9 /* Module cluster API message. */ -#define CLUSTERMSG_TYPE_PUBLISHSHARD 10 /* Pub/Sub Publish shard propagation */ -#define CLUSTERMSG_TYPE_COUNT 11 /* Total number of message types. */ +typedef struct _clusterNode clusterNode; +struct clusterState; /* Flags that a module can set in order to prevent certain Redis Cluster * features to be enabled. 
Useful when implementing a different distributed @@ -108,339 +32,87 @@ typedef struct clusterLink { #define CLUSTER_MODULE_FLAG_NO_FAILOVER (1<<1) #define CLUSTER_MODULE_FLAG_NO_REDIRECTION (1<<2) -/* This structure represent elements of node->fail_reports. */ -typedef struct clusterNodeFailReport { - struct clusterNode *node; /* Node reporting the failure condition. */ - mstime_t time; /* Time of the last report from this node. */ -} clusterNodeFailReport; - -typedef struct clusterNode { - mstime_t ctime; /* Node object creation time. */ - char name[CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */ - char shard_id[CLUSTER_NAMELEN]; /* shard id, hex string, sha1-size */ - int flags; /* CLUSTER_NODE_... */ - uint64_t configEpoch; /* Last configEpoch observed for this node */ - unsigned char slots[CLUSTER_SLOTS/8]; /* slots handled by this node */ - uint16_t *slot_info_pairs; /* Slots info represented as (start/end) pair (consecutive index). */ - int slot_info_pairs_count; /* Used number of slots in slot_info_pairs */ - int numslots; /* Number of slots handled by this node */ - int numslaves; /* Number of slave nodes, if this is a master */ - struct clusterNode **slaves; /* pointers to slave nodes */ - struct clusterNode *slaveof; /* pointer to the master node. Note that it - may be NULL even if the node is a slave - if we don't have the master node in our - tables. 
*/ - unsigned long long last_in_ping_gossip; /* The number of the last carried in the ping gossip section */ - mstime_t ping_sent; /* Unix time we sent latest ping */ - mstime_t pong_received; /* Unix time we received the pong */ - mstime_t data_received; /* Unix time we received any data */ - mstime_t fail_time; /* Unix time when FAIL flag was set */ - mstime_t voted_time; /* Last time we voted for a slave of this master */ - mstime_t repl_offset_time; /* Unix time we received offset for this node */ - mstime_t orphaned_time; /* Starting time of orphaned master condition */ - long long repl_offset; /* Last known repl offset for this node. */ - char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */ - sds hostname; /* The known hostname for this node */ - sds human_nodename; /* The known human readable nodename for this node */ - int tcp_port; /* Latest known clients TCP port. */ - int tls_port; /* Latest known clients TLS port */ - int cport; /* Latest known cluster port of this node. */ - clusterLink *link; /* TCP/IP link established toward this node */ - clusterLink *inbound_link; /* TCP/IP link accepted from this node */ - list *fail_reports; /* List of nodes signaling this as failing */ -} clusterNode; - -/* Slot to keys for a single slot. The keys in the same slot are linked together - * using dictEntry metadata. */ -typedef struct slotToKeys { - uint64_t count; /* Number of keys in the slot. */ - dictEntry *head; /* The first key-value entry in the slot. */ -} slotToKeys; - -/* Slot to keys mapping for all slots, opaque outside this file. */ -struct clusterSlotToKeyMapping { - slotToKeys by_slot[CLUSTER_SLOTS]; -}; - -/* Dict entry metadata for cluster mode, used for the Slot to Key API to form a - * linked list of the entries belonging to the same slot. 
*/ -typedef struct clusterDictEntryMetadata { - dictEntry *prev; /* Prev entry with key in the same slot */ - dictEntry *next; /* Next entry with key in the same slot */ -} clusterDictEntryMetadata; - -typedef struct { - redisDb *db; /* A link back to the db this dict belongs to */ -} clusterDictMetadata; - -typedef struct clusterState { - clusterNode *myself; /* This node */ - uint64_t currentEpoch; - int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */ - int size; /* Num of master nodes with at least one slot */ - dict *nodes; /* Hash table of name -> clusterNode structures */ - dict *shards; /* Hash table of shard_id -> list (of nodes) structures */ - dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */ - clusterNode *migrating_slots_to[CLUSTER_SLOTS]; - clusterNode *importing_slots_from[CLUSTER_SLOTS]; - clusterNode *slots[CLUSTER_SLOTS]; - rax *slots_to_channels; - /* The following fields are used to take the slave state on elections. */ - mstime_t failover_auth_time; /* Time of previous or next election. */ - int failover_auth_count; /* Number of votes received so far. */ - int failover_auth_sent; /* True if we already asked for votes. */ - int failover_auth_rank; /* This slave rank for current auth request. */ - uint64_t failover_auth_epoch; /* Epoch of the current election. */ - int cant_failover_reason; /* Why a slave is currently not able to - failover. See the CANT_FAILOVER_* macros. */ - /* Manual failover state in common. */ - mstime_t mf_end; /* Manual failover time limit (ms unixtime). - It is zero if there is no MF in progress. */ - /* Manual failover state of master. */ - clusterNode *mf_slave; /* Slave performing the manual failover. */ - /* Manual failover state of slave. */ - long long mf_master_offset; /* Master offset the slave needs to start MF - or -1 if still not received. */ - int mf_can_start; /* If non-zero signal that the manual failover - can start requesting masters vote. 
*/ - /* The following fields are used by masters to take state on elections. */ - uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */ - int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ - /* Stats */ - /* Messages received and sent by type. */ - long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT]; - long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT]; - long long stats_pfail_nodes; /* Number of nodes in PFAIL status, - excluding nodes without address. */ - unsigned long long stat_cluster_links_buffer_limit_exceeded; /* Total number of cluster links freed due to exceeding buffer limit */ - - /* Bit map for slots that are no longer claimed by the owner in cluster PING - * messages. During slot migration, the owner will stop claiming the slot after - * the ownership transfer. Set the bit corresponding to the slot when a node - * stops claiming the slot. This prevents spreading incorrect information (that - * source still owns the slot) using UPDATE messages. */ - unsigned char owner_not_claiming_slot[CLUSTER_SLOTS / 8]; -} clusterState; - -/* Redis cluster messages header */ - -/* Initially we don't know our "name", but we'll find it once we connect - * to the first node, using the getsockname() function. Then we'll use this - * address for all the next messages. */ -typedef struct { - char nodename[CLUSTER_NAMELEN]; - uint32_t ping_sent; - uint32_t pong_received; - char ip[NET_IP_STR_LEN]; /* IP address last time it was seen */ - uint16_t port; /* primary port last time it was seen */ - uint16_t cport; /* cluster port last time it was seen */ - uint16_t flags; /* node->flags copy */ - uint16_t pport; /* secondary port last time it was seen */ - uint16_t notused1; -} clusterMsgDataGossip; - -typedef struct { - char nodename[CLUSTER_NAMELEN]; -} clusterMsgDataFail; - -typedef struct { - uint32_t channel_len; - uint32_t message_len; - unsigned char bulk_data[8]; /* 8 bytes just as placeholder. 
*/ -} clusterMsgDataPublish; - -typedef struct { - uint64_t configEpoch; /* Config epoch of the specified instance. */ - char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */ - unsigned char slots[CLUSTER_SLOTS/8]; /* Slots bitmap. */ -} clusterMsgDataUpdate; - -typedef struct { - uint64_t module_id; /* ID of the sender module. */ - uint32_t len; /* ID of the sender module. */ - uint8_t type; /* Type from 0 to 255. */ - unsigned char bulk_data[3]; /* 3 bytes just as placeholder. */ -} clusterMsgModule; - -/* The cluster supports optional extension messages that can be sent - * along with ping/pong/meet messages to give additional info in a - * consistent manner. */ -typedef enum { - CLUSTERMSG_EXT_TYPE_HOSTNAME, - CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME, - CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE, - CLUSTERMSG_EXT_TYPE_SHARDID, -} clusterMsgPingtypes; - -/* Helper function for making sure extensions are eight byte aligned. */ -#define EIGHT_BYTE_ALIGN(size) ((((size) + 7) / 8) * 8) - -typedef struct { - char hostname[1]; /* The announced hostname, ends with \0. */ -} clusterMsgPingExtHostname; - -typedef struct { - char human_nodename[1]; /* The announced nodename, ends with \0. */ -} clusterMsgPingExtHumanNodename; - -typedef struct { - char name[CLUSTER_NAMELEN]; /* Node name. */ - uint64_t ttl; /* Remaining time to blacklist the node, in seconds. */ -} clusterMsgPingExtForgottenNode; - -static_assert(sizeof(clusterMsgPingExtForgottenNode) % 8 == 0, ""); - -typedef struct { - char shard_id[CLUSTER_NAMELEN]; /* The shard_id, 40 bytes fixed. */ -} clusterMsgPingExtShardId; - -typedef struct { - uint32_t length; /* Total length of this extension message (including this header) */ - uint16_t type; /* Type of this extension message (see clusterMsgPingExtTypes) */ - uint16_t unused; /* 16 bits of padding to make this structure 8 byte aligned. 
*/ - union { - clusterMsgPingExtHostname hostname; - clusterMsgPingExtHumanNodename human_nodename; - clusterMsgPingExtForgottenNode forgotten_node; - clusterMsgPingExtShardId shard_id; - } ext[]; /* Actual extension information, formatted so that the data is 8 - * byte aligned, regardless of its content. */ -} clusterMsgPingExt; - -union clusterMsgData { - /* PING, MEET and PONG */ - struct { - /* Array of N clusterMsgDataGossip structures */ - clusterMsgDataGossip gossip[1]; - /* Extension data that can optionally be sent for ping/meet/pong - * messages. We can't explicitly define them here though, since - * the gossip array isn't the real length of the gossip data. */ - } ping; - - /* FAIL */ - struct { - clusterMsgDataFail about; - } fail; - - /* PUBLISH */ - struct { - clusterMsgDataPublish msg; - } publish; - - /* UPDATE */ - struct { - clusterMsgDataUpdate nodecfg; - } update; - - /* MODULE */ - struct { - clusterMsgModule msg; - } module; -}; - -#define CLUSTER_PROTO_VER 1 /* Cluster bus protocol version. */ - -typedef struct { - char sig[4]; /* Signature "RCmb" (Redis Cluster message bus). */ - uint32_t totlen; /* Total length of this message */ - uint16_t ver; /* Protocol version, currently set to 1. */ - uint16_t port; /* Primary port number (TCP or TLS). */ - uint16_t type; /* Message type */ - uint16_t count; /* Only used for some kind of messages. */ - uint64_t currentEpoch; /* The epoch accordingly to the sending node. */ - uint64_t configEpoch; /* The config epoch if it's a master, or the last - epoch advertised by its master if it is a - slave. */ - uint64_t offset; /* Master replication offset if node is a master or - processed replication offset if node is a slave. */ - char sender[CLUSTER_NAMELEN]; /* Name of the sender node */ - unsigned char myslots[CLUSTER_SLOTS/8]; - char slaveof[CLUSTER_NAMELEN]; - char myip[NET_IP_STR_LEN]; /* Sender IP, if not all zeroed. */ - uint16_t extensions; /* Number of extensions sent along with this packet. 
*/ - char notused1[30]; /* 30 bytes reserved for future usage. */ - uint16_t pport; /* Secondary port number: if primary port is TCP port, this is - TLS port, and if primary port is TLS port, this is TCP port.*/ - uint16_t cport; /* Sender TCP cluster bus port */ - uint16_t flags; /* Sender node flags */ - unsigned char state; /* Cluster state from the POV of the sender */ - unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... */ - union clusterMsgData data; -} clusterMsg; - -/* clusterMsg defines the gossip wire protocol exchanged among Redis cluster - * members, which can be running different versions of redis-server bits, - * especially during cluster rolling upgrades. - * - * Therefore, fields in this struct should remain at the same offset from - * release to release. The static asserts below ensures that incompatible - * changes in clusterMsg be caught at compile time. - */ - -static_assert(offsetof(clusterMsg, sig) == 0, "unexpected field offset"); -static_assert(offsetof(clusterMsg, totlen) == 4, "unexpected field offset"); -static_assert(offsetof(clusterMsg, ver) == 8, "unexpected field offset"); -static_assert(offsetof(clusterMsg, port) == 10, "unexpected field offset"); -static_assert(offsetof(clusterMsg, type) == 12, "unexpected field offset"); -static_assert(offsetof(clusterMsg, count) == 14, "unexpected field offset"); -static_assert(offsetof(clusterMsg, currentEpoch) == 16, "unexpected field offset"); -static_assert(offsetof(clusterMsg, configEpoch) == 24, "unexpected field offset"); -static_assert(offsetof(clusterMsg, offset) == 32, "unexpected field offset"); -static_assert(offsetof(clusterMsg, sender) == 40, "unexpected field offset"); -static_assert(offsetof(clusterMsg, myslots) == 80, "unexpected field offset"); -static_assert(offsetof(clusterMsg, slaveof) == 2128, "unexpected field offset"); -static_assert(offsetof(clusterMsg, myip) == 2168, "unexpected field offset"); -static_assert(offsetof(clusterMsg, extensions) == 2214, 
"unexpected field offset"); -static_assert(offsetof(clusterMsg, notused1) == 2216, "unexpected field offset"); -static_assert(offsetof(clusterMsg, pport) == 2246, "unexpected field offset"); -static_assert(offsetof(clusterMsg, cport) == 2248, "unexpected field offset"); -static_assert(offsetof(clusterMsg, flags) == 2250, "unexpected field offset"); -static_assert(offsetof(clusterMsg, state) == 2252, "unexpected field offset"); -static_assert(offsetof(clusterMsg, mflags) == 2253, "unexpected field offset"); -static_assert(offsetof(clusterMsg, data) == 2256, "unexpected field offset"); - -#define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData)) - -/* Message flags better specify the packet content or are used to - * provide some information about the node state. */ -#define CLUSTERMSG_FLAG0_PAUSED (1<<0) /* Master paused for manual failover. */ -#define CLUSTERMSG_FLAG0_FORCEACK (1<<1) /* Give ACK to AUTH_REQUEST even if - master is up. */ -#define CLUSTERMSG_FLAG0_EXT_DATA (1<<2) /* Message contains extension data */ - /* ---------------------- API exported outside cluster.c -------------------- */ +/* functions requiring mechanism specific implementations */ void clusterInit(void); -void clusterInitListeners(void); +void clusterInitLast(void); void clusterCron(void); void clusterBeforeSleep(void); -clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask); -int verifyClusterNodeId(const char *name, int length); -clusterNode *clusterLookupNode(const char *name, int length); -int clusterRedirectBlockedClientIfNeeded(client *c); -void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code); -void migrateCloseTimedoutSockets(void); int verifyClusterConfigWithData(void); -unsigned long getClusterConnectionsCount(void); + int clusterSendModuleMessageToTarget(const char *target, uint64_t module_id, uint8_t type, const char *payload, uint32_t len); -void 
clusterPropagatePublish(robj *channel, robj *message, int sharded); -unsigned int keyHashSlot(char *key, int keylen); -void slotToKeyAddEntry(dictEntry *entry, redisDb *db); -void slotToKeyDelEntry(dictEntry *entry, redisDb *db); -void slotToKeyReplaceEntry(dict *d, dictEntry *entry); -void slotToKeyInit(redisDb *db); -void slotToKeyFlush(redisDb *db); -void slotToKeyDestroy(redisDb *db); + void clusterUpdateMyselfFlags(void); void clusterUpdateMyselfIp(void); -void slotToChannelAdd(sds channel); -void slotToChannelDel(sds channel); void clusterUpdateMyselfHostname(void); void clusterUpdateMyselfAnnouncedPorts(void); +void clusterUpdateMyselfHumanNodename(void); + +void clusterPropagatePublish(robj *channel, robj *message, int sharded); + +unsigned long getClusterConnectionsCount(void); +int isClusterHealthy(void); + sds clusterGenNodesDescription(client *c, int filter, int tls_primary); sds genClusterInfoString(void); -void freeClusterLink(clusterLink *link); -void clusterUpdateMyselfHumanNodename(void); -int isValidAuxString(char *s, unsigned int length); +/* handle implementation specific debug cluster commands. Return 1 if handled, 0 otherwise. */ +int handleDebugClusterCommand(client *c); +const char **clusterDebugCommandExtendedHelp(void); +/* handle implementation specific cluster commands. Return 1 if handled, 0 otherwise. 
*/ +int clusterCommandSpecial(client *c); +const char** clusterCommandExtendedHelp(void); + +int clusterAllowFailoverCmd(client *c); +void clusterPromoteSelfToMaster(void); +int clusterManualFailoverTimeLimit(void); + +void clusterCommandSlots(client * c); +void clusterCommandMyId(client *c); +void clusterCommandMyShardId(client *c); +void clusterCommandShards(client *c); +sds clusterGenNodeDescription(client *c, clusterNode *node, int tls_primary); + +int clusterNodeCoversSlot(clusterNode *n, int slot); int getNodeDefaultClientPort(clusterNode *n); +int clusterNodeIsMyself(clusterNode *n); +clusterNode *getMyClusterNode(void); +char *getMyClusterId(void); +int getClusterSize(void); +int getMyShardSlotCount(void); +int handleDebugClusterCommand(client *c); +int clusterNodePending(clusterNode *node); +int clusterNodeIsMaster(clusterNode *n); +char **getClusterNodesList(size_t *numnodes); +int clusterNodeIsMaster(clusterNode *n); +char *clusterNodeIp(clusterNode *node); +int clusterNodeIsSlave(clusterNode *node); +clusterNode *clusterNodeGetSlaveof(clusterNode *node); +clusterNode *clusterNodeGetMaster(clusterNode *node); +char *clusterNodeGetName(clusterNode *node); +int clusterNodeTimedOut(clusterNode *node); +int clusterNodeIsFailing(clusterNode *node); +int clusterNodeIsNoFailover(clusterNode *node); +char *clusterNodeGetShardId(clusterNode *node); +int clusterNodeNumSlaves(clusterNode *node); +clusterNode *clusterNodeGetSlave(clusterNode *node, int slave_idx); +clusterNode *getMigratingSlotDest(int slot); +clusterNode *getImportingSlotSource(int slot); +clusterNode *getNodeBySlot(int slot); +int clusterNodeClientPort(clusterNode *n, int use_tls); +char *clusterNodeHostname(clusterNode *node); +const char *clusterNodePreferredEndpoint(clusterNode *n); +long long clusterNodeReplOffset(clusterNode *node); +clusterNode *clusterLookupNode(const char *name, int length); +/* functions with shared implementations */ +clusterNode *getNodeByQuery(client *c, struct 
redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask); +int clusterRedirectBlockedClientIfNeeded(client *c); +void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code); +void migrateCloseTimedoutSockets(void); +unsigned int keyHashSlot(char *key, int keylen); +int patternHashSlot(char *pattern, int length); +int isValidAuxString(char *s, unsigned int length); +void migrateCommand(client *c); +void clusterCommand(client *c); +ConnectionType *connTypeOfCluster(void); #endif /* __CLUSTER_H */ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c new file mode 100644 index 00000000000..658b4f3b03b --- /dev/null +++ b/src/cluster_legacy.c @@ -0,0 +1,6498 @@ +/* + * Copyright (c) 2009-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). + */ + +/* + * cluster_legacy.c contains the implementation of the cluster API that is + * specific to the standard, Redis cluster-bus based clustering mechanism. + */ + +#include "server.h" +#include "cluster.h" +#include "cluster_legacy.h" +#include "endianconv.h" +#include "connection.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +/* A global reference to myself is handy to make code more clear. + * Myself always points to server.cluster->myself, that is, the clusterNode + * that represents this node. 
*/ +clusterNode *myself = NULL; + +clusterNode *createClusterNode(char *nodename, int flags); +void clusterAddNode(clusterNode *node); +void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); +void clusterReadHandler(connection *conn); +void clusterSendPing(clusterLink *link, int type); +void clusterSendFail(char *nodename); +void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request); +void clusterUpdateState(void); +int clusterNodeCoversSlot(clusterNode *n, int slot); +list *clusterGetNodesInMyShard(clusterNode *node); +int clusterNodeAddSlave(clusterNode *master, clusterNode *slave); +int clusterAddSlot(clusterNode *n, int slot); +int clusterDelSlot(int slot); +int clusterMoveNodeSlots(clusterNode *from_node, clusterNode *to_node); +int clusterDelNodeSlots(clusterNode *node); +int clusterNodeSetSlotBit(clusterNode *n, int slot); +void clusterSetMaster(clusterNode *n); +void clusterHandleSlaveFailover(void); +void clusterHandleSlaveMigration(int max_slaves); +int bitmapTestBit(unsigned char *bitmap, int pos); +void bitmapSetBit(unsigned char *bitmap, int pos); +void bitmapClearBit(unsigned char *bitmap, int pos); +void clusterDoBeforeSleep(int flags); +void clusterSendUpdate(clusterLink *link, clusterNode *node); +void resetManualFailover(void); +void clusterCloseAllSlots(void); +void clusterSetNodeAsMaster(clusterNode *n); +void clusterDelNode(clusterNode *delnode); +sds representClusterNodeFlags(sds ci, uint16_t flags); +sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count); +void clusterFreeNodesSlotsInfo(clusterNode *n); +uint64_t clusterGetMaxEpoch(void); +int clusterBumpConfigEpochWithoutConsensus(void); +void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, const unsigned char *payload, uint32_t len); +const char *clusterGetMessageTypeString(int type); +void removeChannelsInSlot(unsigned int slot); +unsigned int countKeysInSlot(unsigned int hashslot); 
+unsigned int countChannelsInSlot(unsigned int hashslot); +unsigned int delKeysInSlot(unsigned int hashslot); +void clusterAddNodeToShard(const char *shard_id, clusterNode *node); +list *clusterLookupNodeListByShardId(const char *shard_id); +void clusterRemoveNodeFromShard(clusterNode *node); +int auxShardIdSetter(clusterNode *n, void *value, int length); +sds auxShardIdGetter(clusterNode *n, sds s); +int auxShardIdPresent(clusterNode *n); +int auxHumanNodenameSetter(clusterNode *n, void *value, int length); +sds auxHumanNodenameGetter(clusterNode *n, sds s); +int auxHumanNodenamePresent(clusterNode *n); +int auxTcpPortSetter(clusterNode *n, void *value, int length); +sds auxTcpPortGetter(clusterNode *n, sds s); +int auxTcpPortPresent(clusterNode *n); +int auxTlsPortSetter(clusterNode *n, void *value, int length); +sds auxTlsPortGetter(clusterNode *n, sds s); +int auxTlsPortPresent(clusterNode *n); +static void clusterBuildMessageHdr(clusterMsg *hdr, int type, size_t msglen); +void freeClusterLink(clusterLink *link); +int verifyClusterNodeId(const char *name, int length); + +int getNodeDefaultClientPort(clusterNode *n) { + return server.tls_cluster ? n->tls_port : n->tcp_port; +} + +static inline int getNodeDefaultReplicationPort(clusterNode *n) { + return server.tls_replication ? n->tls_port : n->tcp_port; +} + +int clusterNodeClientPort(clusterNode *n, int use_tls) { + return use_tls ? n->tls_port : n->tcp_port; +} + +static inline int defaultClientPort(void) { + return server.tls_cluster ? server.tls_port : server.port; +} + +#define isSlotUnclaimed(slot) \ + (server.cluster->slots[slot] == NULL || \ + bitmapTestBit(server.cluster->owner_not_claiming_slot, slot)) + +#define RCVBUF_INIT_LEN 1024 +#define RCVBUF_MAX_PREALLOC (1<<20) /* 1MB */ + +/* Cluster nodes hash table, mapping nodes addresses 1.2.3.4:6379 to + * clusterNode structures. 
*/ +dictType clusterNodesDictType = { + dictSdsHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCompare, /* key compare */ + dictSdsDestructor, /* key destructor */ + NULL, /* val destructor */ + NULL /* allow to expand */ +}; + +/* Cluster re-addition blacklist. This maps node IDs to the time + * we can re-add this node. The goal is to avoid reading a removed + * node for some time. */ +dictType clusterNodesBlackListDictType = { + dictSdsCaseHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCaseCompare, /* key compare */ + dictSdsDestructor, /* key destructor */ + NULL, /* val destructor */ + NULL /* allow to expand */ +}; + +/* Cluster shards hash table, mapping shard id to list of nodes */ +dictType clusterSdsToListType = { + dictSdsHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCompare, /* key compare */ + dictSdsDestructor, /* key destructor */ + dictListDestructor, /* val destructor */ + NULL /* allow to expand */ +}; + +/* Aux fields are introduced in Redis 7.2 to support the persistence + * of various important node properties, such as shard id, in nodes.conf. + * Aux fields take an explicit format of name=value pairs and have no + * intrinsic order among them. Aux fields are always grouped together + * at the end of the second column of each row after the node's IP + * address/port/cluster_port and the optional hostname. Aux fields + * are separated by ','. 
*/ + +/* Aux field setter function prototype + * return C_OK when the update is successful; C_ERR otherwise */ +typedef int (aux_value_setter) (clusterNode* n, void *value, int length); +/* Aux field getter function prototype + * return an sds that is a concatenation of the input sds string and + * the aux value */ +typedef sds (aux_value_getter) (clusterNode* n, sds s); + +typedef int (aux_value_present) (clusterNode* n); + +typedef struct { + char *field; + aux_value_setter *setter; + aux_value_getter *getter; + aux_value_present *isPresent; +} auxFieldHandler; + +/* Assign index to each aux field */ +typedef enum { + af_shard_id, + af_human_nodename, + af_tcp_port, + af_tls_port, + af_count, +} auxFieldIndex; + +/* Note that + * 1. the order of the elements below must match that of their + * indices as defined in auxFieldIndex + * 2. aux name can contain characters that pass the isValidAuxChar check only */ +auxFieldHandler auxFieldHandlers[] = { + {"shard-id", auxShardIdSetter, auxShardIdGetter, auxShardIdPresent}, + {"nodename", auxHumanNodenameSetter, auxHumanNodenameGetter, auxHumanNodenamePresent}, + {"tcp-port", auxTcpPortSetter, auxTcpPortGetter, auxTcpPortPresent}, + {"tls-port", auxTlsPortSetter, auxTlsPortGetter, auxTlsPortPresent}, +}; + +int auxShardIdSetter(clusterNode *n, void *value, int length) { + if (verifyClusterNodeId(value, length) == C_ERR) { + return C_ERR; + } + memcpy(n->shard_id, value, CLUSTER_NAMELEN); + /* if n already has replicas, make sure they all agree + * on the shard id */ + for (int i = 0; i < n->numslaves; i++) { + if (memcmp(n->slaves[i]->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) { + return C_ERR; + } + } + clusterAddNodeToShard(value, n); + return C_OK; +} + +sds auxShardIdGetter(clusterNode *n, sds s) { + return sdscatprintf(s, "%.40s", n->shard_id); +} + +int auxShardIdPresent(clusterNode *n) { + return strlen(n->shard_id); +} + +int auxHumanNodenameSetter(clusterNode *n, void *value, int length) { + if (n && 
!strncmp(value, n->human_nodename, length)) { + return C_OK; + } else if (!n && (length == 0)) { + return C_OK; + } + if (n) { + n->human_nodename = sdscpylen(n->human_nodename, value, length); + } else if (sdslen(n->human_nodename) != 0) { + sdsclear(n->human_nodename); + } else { + return C_ERR; + } + return C_OK; +} + +sds auxHumanNodenameGetter(clusterNode *n, sds s) { + return sdscatprintf(s, "%s", n->human_nodename); +} + +int auxHumanNodenamePresent(clusterNode *n) { + return sdslen(n->human_nodename); +} + +int auxTcpPortSetter(clusterNode *n, void *value, int length) { + if (length > 5 || length < 1) { + return C_ERR; + } + char buf[length + 1]; + memcpy(buf, (char*)value, length); + buf[length] = '\0'; + n->tcp_port = atoi(buf); + return (n->tcp_port < 0 || n->tcp_port >= 65536) ? C_ERR : C_OK; +} + +sds auxTcpPortGetter(clusterNode *n, sds s) { + return sdscatprintf(s, "%d", n->tcp_port); +} + +int auxTcpPortPresent(clusterNode *n) { + return n->tcp_port >= 0 && n->tcp_port < 65536; +} + +int auxTlsPortSetter(clusterNode *n, void *value, int length) { + if (length > 5 || length < 1) { + return C_ERR; + } + char buf[length + 1]; + memcpy(buf, (char*)value, length); + buf[length] = '\0'; + n->tls_port = atoi(buf); + return (n->tls_port < 0 || n->tls_port >= 65536) ? C_ERR : C_OK; +} + +sds auxTlsPortGetter(clusterNode *n, sds s) { + return sdscatprintf(s, "%d", n->tls_port); +} + +int auxTlsPortPresent(clusterNode *n) { + return n->tls_port >= 0 && n->tls_port < 65536; +} + +/* clusterLink send queue blocks */ +typedef struct { + size_t totlen; /* Total length of this block including the message */ + int refcount; /* Number of cluster link send msg queues containing the message */ + clusterMsg msg; +} clusterMsgSendBlock; + +/* ----------------------------------------------------------------------------- + * Initialization + * -------------------------------------------------------------------------- */ + +/* Load the cluster config from 'filename'. 
+ * + * If the file does not exist or is zero-length (this may happen because + * when we lock the nodes.conf file, we create a zero-length one for the + * sake of locking if it does not already exist), C_ERR is returned. + * If the configuration was loaded from the file, C_OK is returned. */ +int clusterLoadConfig(char *filename) { + FILE *fp = fopen(filename,"r"); + struct stat sb; + char *line; + int maxline, j; + + if (fp == NULL) { + if (errno == ENOENT) { + return C_ERR; + } else { + serverLog(LL_WARNING, + "Loading the cluster node config from %s: %s", + filename, strerror(errno)); + exit(1); + } + } + + if (redis_fstat(fileno(fp),&sb) == -1) { + serverLog(LL_WARNING, + "Unable to obtain the cluster node config file stat %s: %s", + filename, strerror(errno)); + exit(1); + } + /* Check if the file is zero-length: if so return C_ERR to signal + * we have to write the config. */ + if (sb.st_size == 0) { + fclose(fp); + return C_ERR; + } + + /* Parse the file. Note that single lines of the cluster config file can + * be really long as they include all the hash slots of the node. + * This means in the worst possible case, half of the Redis slots will be + * present in a single line, possibly in importing or migrating state, so + * together with the node ID of the sender/receiver. + * + * To simplify we allocate 1024+CLUSTER_SLOTS*128 bytes per line. */ + maxline = 1024+CLUSTER_SLOTS*128; + line = zmalloc(maxline); + while(fgets(line,maxline,fp) != NULL) { + int argc, aux_argc; + sds *argv, *aux_argv; + clusterNode *n, *master; + char *p, *s; + + /* Skip blank lines, they can be created either by users manually + * editing nodes.conf or by the config writing process if stopped + * before the truncate() call. */ + if (line[0] == '\n' || line[0] == '\0') continue; + + /* Split the line into arguments for processing. */ + argv = sdssplitargs(line,&argc); + if (argv == NULL) goto fmterr; + + /* Handle the special "vars" line. 
Don't pretend it is the last + * line even if it actually is when generated by Redis. */ + if (strcasecmp(argv[0],"vars") == 0) { + if (!(argc % 2)) goto fmterr; + for (j = 1; j < argc; j += 2) { + if (strcasecmp(argv[j],"currentEpoch") == 0) { + server.cluster->currentEpoch = + strtoull(argv[j+1],NULL,10); + } else if (strcasecmp(argv[j],"lastVoteEpoch") == 0) { + server.cluster->lastVoteEpoch = + strtoull(argv[j+1],NULL,10); + } else { + serverLog(LL_NOTICE, + "Skipping unknown cluster config variable '%s'", + argv[j]); + } + } + sdsfreesplitres(argv,argc); + continue; + } + + /* Regular config lines have at least eight fields */ + if (argc < 8) { + sdsfreesplitres(argv,argc); + goto fmterr; + } + + /* Create this node if it does not exist */ + if (verifyClusterNodeId(argv[0], sdslen(argv[0])) == C_ERR) { + sdsfreesplitres(argv, argc); + goto fmterr; + } + n = clusterLookupNode(argv[0], sdslen(argv[0])); + if (!n) { + n = createClusterNode(argv[0],0); + clusterAddNode(n); + } + /* Format for the node address and auxiliary argument information: + * ip:port[@cport][,hostname][,aux=val]*] */ + + aux_argv = sdssplitlen(argv[1], sdslen(argv[1]), ",", 1, &aux_argc); + if (aux_argv == NULL) { + sdsfreesplitres(argv,argc); + goto fmterr; + } + + /* Hostname is an optional argument that defines the endpoint + * that can be reported to clients instead of IP. */ + if (aux_argc > 1 && sdslen(aux_argv[1]) > 0) { + n->hostname = sdscpy(n->hostname, aux_argv[1]); + } else if (sdslen(n->hostname) != 0) { + sdsclear(n->hostname); + } + + /* All fields after hostname are auxiliary and they take on + * the format of "aux=val" where both aux and val can contain + * characters that pass the isValidAuxChar check only. The order + * of the aux fields is insignificant. 
*/ + int aux_tcp_port = 0; + int aux_tls_port = 0; + for (int i = 2; i < aux_argc; i++) { + int field_argc; + sds *field_argv; + field_argv = sdssplitlen(aux_argv[i], sdslen(aux_argv[i]), "=", 1, &field_argc); + if (field_argv == NULL || field_argc != 2) { + /* Invalid aux field format */ + if (field_argv != NULL) sdsfreesplitres(field_argv, field_argc); + sdsfreesplitres(aux_argv, aux_argc); + sdsfreesplitres(argv,argc); + goto fmterr; + } + + /* Validate that both aux and value contain valid characters only */ + for (unsigned j = 0; j < 2; j++) { + if (!isValidAuxString(field_argv[j],sdslen(field_argv[j]))){ + /* Invalid aux field format */ + sdsfreesplitres(field_argv, field_argc); + sdsfreesplitres(aux_argv, aux_argc); + sdsfreesplitres(argv,argc); + goto fmterr; + } + } + + /* Note that we don't expect lots of aux fields in the foreseeable + * future so a linear search is completely fine. */ + int field_found = 0; + for (unsigned j = 0; j < numElements(auxFieldHandlers); j++) { + if (sdslen(field_argv[0]) != strlen(auxFieldHandlers[j].field) || + memcmp(field_argv[0], auxFieldHandlers[j].field, sdslen(field_argv[0])) != 0) { + continue; + } + field_found = 1; + aux_tcp_port |= j == af_tcp_port; + aux_tls_port |= j == af_tls_port; + if (auxFieldHandlers[j].setter(n, field_argv[1], sdslen(field_argv[1])) != C_OK) { + /* Invalid aux field format */ + sdsfreesplitres(field_argv, field_argc); + sdsfreesplitres(aux_argv, aux_argc); + sdsfreesplitres(argv,argc); + goto fmterr; + } + } + + if (field_found == 0) { + /* Invalid aux field format */ + sdsfreesplitres(field_argv, field_argc); + sdsfreesplitres(aux_argv, aux_argc); + sdsfreesplitres(argv,argc); + goto fmterr; + } + + sdsfreesplitres(field_argv, field_argc); + } + /* Address and port */ + if ((p = strrchr(aux_argv[0],':')) == NULL) { + sdsfreesplitres(aux_argv, aux_argc); + sdsfreesplitres(argv,argc); + goto fmterr; + } + *p = '\0'; + memcpy(n->ip,aux_argv[0],strlen(aux_argv[0])+1); + char *port = p+1; + 
char *busp = strchr(port,'@'); + if (busp) { + *busp = '\0'; + busp++; + } + /* If neither TCP or TLS port is found in aux field, it is considered + * an old version of nodes.conf file.*/ + if (!aux_tcp_port && !aux_tls_port) { + if (server.tls_cluster) { + n->tls_port = atoi(port); + } else { + n->tcp_port = atoi(port); + } + } else if (!aux_tcp_port) { + n->tcp_port = atoi(port); + } else if (!aux_tls_port) { + n->tls_port = atoi(port); + } + /* In older versions of nodes.conf the "@busport" part is missing. + * In this case we set it to the default offset of 10000 from the + * base port. */ + n->cport = busp ? atoi(busp) : (getNodeDefaultClientPort(n) + CLUSTER_PORT_INCR); + + /* The plaintext port for client in a TLS cluster (n->pport) is not + * stored in nodes.conf. It is received later over the bus protocol. */ + + sdsfreesplitres(aux_argv, aux_argc); + + /* Parse flags */ + p = s = argv[2]; + while(p) { + p = strchr(s,','); + if (p) *p = '\0'; + if (!strcasecmp(s,"myself")) { + serverAssert(server.cluster->myself == NULL); + myself = server.cluster->myself = n; + n->flags |= CLUSTER_NODE_MYSELF; + } else if (!strcasecmp(s,"master")) { + n->flags |= CLUSTER_NODE_MASTER; + } else if (!strcasecmp(s,"slave")) { + n->flags |= CLUSTER_NODE_SLAVE; + } else if (!strcasecmp(s,"fail?")) { + n->flags |= CLUSTER_NODE_PFAIL; + } else if (!strcasecmp(s,"fail")) { + n->flags |= CLUSTER_NODE_FAIL; + n->fail_time = mstime(); + } else if (!strcasecmp(s,"handshake")) { + n->flags |= CLUSTER_NODE_HANDSHAKE; + } else if (!strcasecmp(s,"noaddr")) { + n->flags |= CLUSTER_NODE_NOADDR; + } else if (!strcasecmp(s,"nofailover")) { + n->flags |= CLUSTER_NODE_NOFAILOVER; + } else if (!strcasecmp(s,"noflags")) { + /* nothing to do */ + } else { + serverPanic("Unknown flag in redis cluster config file"); + } + if (p) s = p+1; + } + + /* Get master if any. Set the master and populate master's + * slave list. 
*/ + if (argv[3][0] != '-') { + if (verifyClusterNodeId(argv[3], sdslen(argv[3])) == C_ERR) { + sdsfreesplitres(argv, argc); + goto fmterr; + } + master = clusterLookupNode(argv[3], sdslen(argv[3])); + if (!master) { + master = createClusterNode(argv[3],0); + clusterAddNode(master); + } + /* shard_id can be absent if we are loading a nodes.conf generated + * by an older version of Redis; we should follow the primary's + * shard_id in this case */ + if (auxFieldHandlers[af_shard_id].isPresent(n) == 0) { + memcpy(n->shard_id, master->shard_id, CLUSTER_NAMELEN); + clusterAddNodeToShard(master->shard_id, n); + } else if (clusterGetNodesInMyShard(master) != NULL && + memcmp(master->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) + { + /* If the primary has been added to a shard, make sure this + * node has the same persisted shard id as the primary. */ + goto fmterr; + } + n->slaveof = master; + clusterNodeAddSlave(master,n); + } else if (auxFieldHandlers[af_shard_id].isPresent(n) == 0) { + /* n is a primary but it does not have a persisted shard_id. + * This happens if we are loading a nodes.conf generated by + * an older version of Redis. We should manually update the + * shard membership in this case */ + clusterAddNodeToShard(n->shard_id, n); + } + + /* Set ping sent / pong received timestamps */ + if (atoi(argv[4])) n->ping_sent = mstime(); + if (atoi(argv[5])) n->pong_received = mstime(); + + /* Set configEpoch for this node. + * If the node is a replica, set its config epoch to 0. + * If it's a primary, load the config epoch from the configuration file. */ + n->configEpoch = (nodeIsSlave(n) && n->slaveof) ? 0 : strtoull(argv[6],NULL,10); + + /* Populate hash slots served by this instance. 
*/ + for (j = 8; j < argc; j++) { + int start, stop; + + if (argv[j][0] == '[') { + /* Here we handle migrating / importing slots */ + int slot; + char direction; + clusterNode *cn; + + p = strchr(argv[j],'-'); + serverAssert(p != NULL); + *p = '\0'; + direction = p[1]; /* Either '>' or '<' */ + slot = atoi(argv[j]+1); + if (slot < 0 || slot >= CLUSTER_SLOTS) { + sdsfreesplitres(argv,argc); + goto fmterr; + } + p += 3; + + char *pr = strchr(p, ']'); + size_t node_len = pr - p; + if (pr == NULL || verifyClusterNodeId(p, node_len) == C_ERR) { + sdsfreesplitres(argv, argc); + goto fmterr; + } + cn = clusterLookupNode(p, CLUSTER_NAMELEN); + if (!cn) { + cn = createClusterNode(p,0); + clusterAddNode(cn); + } + if (direction == '>') { + server.cluster->migrating_slots_to[slot] = cn; + } else { + server.cluster->importing_slots_from[slot] = cn; + } + continue; + } else if ((p = strchr(argv[j],'-')) != NULL) { + *p = '\0'; + start = atoi(argv[j]); + stop = atoi(p+1); + } else { + start = stop = atoi(argv[j]); + } + if (start < 0 || start >= CLUSTER_SLOTS || + stop < 0 || stop >= CLUSTER_SLOTS) + { + sdsfreesplitres(argv,argc); + goto fmterr; + } + while(start <= stop) clusterAddSlot(n, start++); + } + + sdsfreesplitres(argv,argc); + } + /* Config sanity check */ + if (server.cluster->myself == NULL) goto fmterr; + + zfree(line); + fclose(fp); + + serverLog(LL_NOTICE,"Node configuration loaded, I'm %.40s", myself->name); + + /* Something that should never happen: currentEpoch smaller than + * the max epoch found in the nodes configuration. However we handle this + * as some form of protection against manual editing of critical files. 
*/ + if (clusterGetMaxEpoch() > server.cluster->currentEpoch) { + server.cluster->currentEpoch = clusterGetMaxEpoch(); + } + return C_OK; + +fmterr: + serverLog(LL_WARNING, + "Unrecoverable error: corrupted cluster config file \"%s\".", line); + zfree(line); + if (fp) fclose(fp); + exit(1); +} + +/* Cluster node configuration is exactly the same as CLUSTER NODES output. + * + * This function writes the node config and returns 0, on error -1 + * is returned. + * + * Note: we need to write the file in an atomic way from the point of view + * of the POSIX filesystem semantics, so that if the server is stopped + * or crashes during the write, we'll end with either the old file or the + * new one. Since we have the full payload to write available we can use + * a single write to write the whole file. If the pre-existing file was + * bigger we pad our payload with newlines that are anyway ignored and truncate + * the file afterward. */ +int clusterSaveConfig(int do_fsync) { + sds ci,tmpfilename; + size_t content_size,offset = 0; + ssize_t written_bytes; + int fd = -1; + int retval = C_ERR; + + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_SAVE_CONFIG; + + /* Get the nodes description and concatenate our "vars" directive to + * save currentEpoch and lastVoteEpoch. */ + ci = clusterGenNodesDescription(NULL, CLUSTER_NODE_HANDSHAKE, 0); + ci = sdscatprintf(ci,"vars currentEpoch %llu lastVoteEpoch %llu\n", + (unsigned long long) server.cluster->currentEpoch, + (unsigned long long) server.cluster->lastVoteEpoch); + content_size = sdslen(ci); + + /* Create a temp file with the new content. 
*/ + tmpfilename = sdscatfmt(sdsempty(),"%s.tmp-%i-%I", + server.cluster_configfile,(int) getpid(),mstime()); + if ((fd = open(tmpfilename,O_WRONLY|O_CREAT,0644)) == -1) { + serverLog(LL_WARNING,"Could not open temp cluster config file: %s",strerror(errno)); + goto cleanup; + } + + while (offset < content_size) { + written_bytes = write(fd,ci + offset,content_size - offset); + if (written_bytes <= 0) { + if (errno == EINTR) continue; + serverLog(LL_WARNING,"Failed after writing (%zd) bytes to tmp cluster config file: %s", + offset,strerror(errno)); + goto cleanup; + } + offset += written_bytes; + } + + if (do_fsync) { + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_FSYNC_CONFIG; + if (redis_fsync(fd) == -1) { + serverLog(LL_WARNING,"Could not sync tmp cluster config file: %s",strerror(errno)); + goto cleanup; + } + } + + if (rename(tmpfilename, server.cluster_configfile) == -1) { + serverLog(LL_WARNING,"Could not rename tmp cluster config file: %s",strerror(errno)); + goto cleanup; + } + + if (do_fsync) { + if (fsyncFileDir(server.cluster_configfile) == -1) { + serverLog(LL_WARNING,"Could not sync cluster config file dir: %s",strerror(errno)); + goto cleanup; + } + } + retval = C_OK; /* If we reached this point, everything is fine. */ + +cleanup: + if (fd != -1) close(fd); + if (retval) unlink(tmpfilename); + sdsfree(tmpfilename); + sdsfree(ci); + return retval; +} + +void clusterSaveConfigOrDie(int do_fsync) { + if (clusterSaveConfig(do_fsync) == -1) { + serverLog(LL_WARNING,"Fatal: can't update cluster config file."); + exit(1); + } +} + +/* Lock the cluster config using flock(), and retain the file descriptor used to + * acquire the lock so that the file will be locked as long as the process is up. + * + * This works because we always update nodes.conf with a new version + * in-place, reopening the file, and writing to it in place (later adjusting + * the length with ftruncate()). 
+ * + * On success C_OK is returned, otherwise an error is logged and + * the function returns C_ERR to signal a lock was not acquired. */ +int clusterLockConfig(char *filename) { +/* flock() does not exist on Solaris + * and a fcntl-based solution won't help, as we constantly re-open that file, + * which will release _all_ locks anyway + */ +#if !defined(__sun) + /* To lock it, we need to open the file in a way it is created if + * it does not exist, otherwise there is a race condition with other + * processes. */ + int fd = open(filename,O_WRONLY|O_CREAT|O_CLOEXEC,0644); + if (fd == -1) { + serverLog(LL_WARNING, + "Can't open %s in order to acquire a lock: %s", + filename, strerror(errno)); + return C_ERR; + } + + if (flock(fd,LOCK_EX|LOCK_NB) == -1) { + if (errno == EWOULDBLOCK) { + serverLog(LL_WARNING, + "Sorry, the cluster configuration file %s is already used " + "by a different Redis Cluster node. Please make sure that " + "different nodes use different cluster configuration " + "files.", filename); + } else { + serverLog(LL_WARNING, + "Impossible to lock %s: %s", filename, strerror(errno)); + } + close(fd); + return C_ERR; + } + /* Lock acquired: leak the 'fd' by not closing it until shutdown time, so that + * we'll retain the lock to the file as long as the process exists. + * + * After fork, the child process will get the fd opened by the parent process, + * we need save `fd` to `cluster_config_file_lock_fd`, so that in redisFork(), + * it will be closed in the child process. + * If it is not closed, when the main process is killed -9, but the child process + * (redis-aof-rewrite) is still alive, the fd(lock) will still be held by the + * child process, and the main process will fail to get lock, means fail to start. */ + server.cluster_config_file_lock_fd = fd; +#else + UNUSED(filename); +#endif /* __sun */ + + return C_OK; +} + +/* Derives our ports to be announced in the cluster bus. 
*/ +void deriveAnnouncedPorts(int *announced_tcp_port, int *announced_tls_port, + int *announced_cport) { + /* Config overriding announced ports. */ + *announced_tcp_port = server.cluster_announce_port ? + server.cluster_announce_port : server.port; + *announced_tls_port = server.cluster_announce_tls_port ? + server.cluster_announce_tls_port : server.tls_port; + /* Derive cluster bus port. */ + if (server.cluster_announce_bus_port) { + *announced_cport = server.cluster_announce_bus_port; + } else if (server.cluster_port) { + *announced_cport = server.cluster_port; + } else { + *announced_cport = defaultClientPort() + CLUSTER_PORT_INCR; + } +} + +/* Some flags (currently just the NOFAILOVER flag) may need to be updated + * in the "myself" node based on the current configuration of the node, + * that may change at runtime via CONFIG SET. This function changes the + * set of flags in myself->flags accordingly. */ +void clusterUpdateMyselfFlags(void) { + if (!myself) return; + int oldflags = myself->flags; + int nofailover = server.cluster_slave_no_failover ? + CLUSTER_NODE_NOFAILOVER : 0; + myself->flags &= ~CLUSTER_NODE_NOFAILOVER; + myself->flags |= nofailover; + if (myself->flags != oldflags) { + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } +} + + +/* We want to take myself->port/cport/pport in sync with the +* cluster-announce-port/cluster-announce-bus-port/cluster-announce-tls-port option. +* The option can be set at runtime via CONFIG SET. */ +void clusterUpdateMyselfAnnouncedPorts(void) { + if (!myself) return; + deriveAnnouncedPorts(&myself->tcp_port,&myself->tls_port,&myself->cport); +} + +/* We want to take myself->ip in sync with the cluster-announce-ip option. +* The option can be set at runtime via CONFIG SET. 
*/ +void clusterUpdateMyselfIp(void) { + if (!myself) return; + static char *prev_ip = NULL; + char *curr_ip = server.cluster_announce_ip; + int changed = 0; + + if (prev_ip == NULL && curr_ip != NULL) changed = 1; + else if (prev_ip != NULL && curr_ip == NULL) changed = 1; + else if (prev_ip && curr_ip && strcmp(prev_ip,curr_ip)) changed = 1; + + if (changed) { + if (prev_ip) zfree(prev_ip); + prev_ip = curr_ip; + + if (curr_ip) { + /* We always take a copy of the previous IP address, by + * duplicating the string. This way later we can check if + * the address really changed. */ + prev_ip = zstrdup(prev_ip); + redis_strlcpy(myself->ip,server.cluster_announce_ip,NET_IP_STR_LEN); + } else { + myself->ip[0] = '\0'; /* Force autodetection. */ + } + } +} + +/* Update the hostname for the specified node with the provided C string. */ +static void updateAnnouncedHostname(clusterNode *node, char *new) { + /* Previous and new hostname are the same, no need to update. */ + if (new && !strcmp(new, node->hostname)) { + return; + } else if (!new && (sdslen(node->hostname) == 0)) { + return; + } + + if (new) { + node->hostname = sdscpy(node->hostname, new); + } else if (sdslen(node->hostname) != 0) { + sdsclear(node->hostname); + } + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); +} + +static void updateAnnouncedHumanNodename(clusterNode *node, char *new) { + if (new && !strcmp(new, node->human_nodename)) { + return; + } else if (!new && (sdslen(node->human_nodename) == 0)) { + return; + } + + if (new) { + node->human_nodename = sdscpy(node->human_nodename, new); + } else if (sdslen(node->human_nodename) != 0) { + sdsclear(node->human_nodename); + } + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); +} + + +static void updateShardId(clusterNode *node, const char *shard_id) { + if (shard_id && memcmp(node->shard_id, shard_id, CLUSTER_NAMELEN) != 0) { + clusterRemoveNodeFromShard(node); + memcpy(node->shard_id, shard_id, CLUSTER_NAMELEN); + clusterAddNodeToShard(shard_id, node); 
+ clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } + if (shard_id && myself != node && myself->slaveof == node) { + if (memcmp(myself->shard_id, shard_id, CLUSTER_NAMELEN) != 0) { + /* shard-id can diverge right after a rolling upgrade + * from pre-7.2 releases */ + clusterRemoveNodeFromShard(myself); + memcpy(myself->shard_id, shard_id, CLUSTER_NAMELEN); + clusterAddNodeToShard(shard_id, myself); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_FSYNC_CONFIG); + } + } +} + +/* Update my hostname based on server configuration values */ +void clusterUpdateMyselfHostname(void) { + if (!myself) return; + updateAnnouncedHostname(myself, server.cluster_announce_hostname); +} + +void clusterUpdateMyselfHumanNodename(void) { + if (!myself) return; + updateAnnouncedHumanNodename(myself, server.cluster_announce_human_nodename); +} + +void clusterInit(void) { + int saveconf = 0; + + server.cluster = zmalloc(sizeof(struct clusterState)); + server.cluster->myself = NULL; + server.cluster->currentEpoch = 0; + server.cluster->state = CLUSTER_FAIL; + server.cluster->size = 0; + server.cluster->todo_before_sleep = 0; + server.cluster->nodes = dictCreate(&clusterNodesDictType); + server.cluster->shards = dictCreate(&clusterSdsToListType); + server.cluster->nodes_black_list = + dictCreate(&clusterNodesBlackListDictType); + server.cluster->failover_auth_time = 0; + server.cluster->failover_auth_count = 0; + server.cluster->failover_auth_rank = 0; + server.cluster->failover_auth_epoch = 0; + server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; + server.cluster->lastVoteEpoch = 0; + + /* Initialize stats */ + for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { + server.cluster->stats_bus_messages_sent[i] = 0; + server.cluster->stats_bus_messages_received[i] = 0; + } + server.cluster->stats_pfail_nodes = 0; + server.cluster->stat_cluster_links_buffer_limit_exceeded = 0; + + memset(server.cluster->slots,0, sizeof(server.cluster->slots)); + 
clusterCloseAllSlots(); + + memset(server.cluster->owner_not_claiming_slot, 0, sizeof(server.cluster->owner_not_claiming_slot)); + + /* Lock the cluster config file to make sure every node uses + * its own nodes.conf. */ + server.cluster_config_file_lock_fd = -1; + if (clusterLockConfig(server.cluster_configfile) == C_ERR) + exit(1); + + /* Load or create a new nodes configuration. */ + if (clusterLoadConfig(server.cluster_configfile) == C_ERR) { + /* No configuration found. We will just use the random name provided + * by the createClusterNode() function. */ + myself = server.cluster->myself = + createClusterNode(NULL,CLUSTER_NODE_MYSELF|CLUSTER_NODE_MASTER); + serverLog(LL_NOTICE,"No cluster configuration found, I'm %.40s", + myself->name); + clusterAddNode(myself); + clusterAddNodeToShard(myself->shard_id, myself); + saveconf = 1; + } + if (saveconf) clusterSaveConfigOrDie(1); + + /* Port sanity check II + * The other handshake port check is triggered too late to stop + * us from trying to use a too-high cluster port number. */ + int port = defaultClientPort(); + if (!server.cluster_port && port > (65535-CLUSTER_PORT_INCR)) { + serverLog(LL_WARNING, "Redis port number too high. " + "Cluster communication port is 10,000 port " + "numbers higher than your Redis port. " + "Your Redis port number must be 55535 or less."); + exit(1); + } + if (!server.bindaddr_count) { + serverLog(LL_WARNING, "No bind address is configured, but it is required for the Cluster bus."); + exit(1); + } + + /* Set myself->port/cport/pport to my listening ports, we'll just need to + * discover the IP address via MEET messages. 
*/ + deriveAnnouncedPorts(&myself->tcp_port, &myself->tls_port, &myself->cport); + + server.cluster->mf_end = 0; + server.cluster->mf_slave = NULL; + resetManualFailover(); + clusterUpdateMyselfFlags(); + clusterUpdateMyselfIp(); + clusterUpdateMyselfHostname(); + clusterUpdateMyselfHumanNodename(); +} + +void clusterInitLast(void) { + if (connectionIndexByType(connTypeOfCluster()->get_type(NULL)) < 0) { + serverLog(LL_WARNING, "Missing connection type %s, but it is required for the Cluster bus.", connTypeOfCluster()->get_type(NULL)); + exit(1); + } + + int port = defaultClientPort(); + connListener *listener = &server.clistener; + listener->count = 0; + listener->bindaddr = server.bindaddr; + listener->bindaddr_count = server.bindaddr_count; + listener->port = server.cluster_port ? server.cluster_port : port + CLUSTER_PORT_INCR; + listener->ct = connTypeOfCluster(); + if (connListen(listener) == C_ERR ) { + /* Note: the following log text is matched by the test suite. */ + serverLog(LL_WARNING, "Failed listening on port %u (cluster), aborting.", listener->port); + exit(1); + } + + if (createSocketAcceptHandler(&server.clistener, clusterAcceptHandler) != C_OK) { + serverPanic("Unrecoverable error creating Redis Cluster socket accept handler."); + } +} + +/* Reset a node performing a soft or hard reset: + * + * 1) All other nodes are forgotten. + * 2) All the assigned / open slots are released. + * 3) If the node is a slave, it turns into a master. + * 4) Only for hard reset: a new Node ID is generated. + * 5) Only for hard reset: currentEpoch and configEpoch are set to 0. + * 6) The new configuration is saved and the cluster state updated. + * 7) If the node was a slave, the whole data set is flushed away. */ +void clusterReset(int hard) { + dictIterator *di; + dictEntry *de; + int j; + + /* Turn into master. 
*/ + if (nodeIsSlave(myself)) { + clusterSetNodeAsMaster(myself); + replicationUnsetMaster(); + emptyData(-1,EMPTYDB_NO_FLAGS,NULL); + } + + /* Close slots, reset manual failover state. */ + clusterCloseAllSlots(); + resetManualFailover(); + + /* Unassign all the slots. */ + for (j = 0; j < CLUSTER_SLOTS; j++) clusterDelSlot(j); + + /* Recreate shards dict */ + dictEmpty(server.cluster->shards, NULL); + + /* Forget all the nodes, but myself. */ + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (node == myself) continue; + clusterDelNode(node); + } + dictReleaseIterator(di); + + /* Empty the nodes blacklist. */ + dictEmpty(server.cluster->nodes_black_list, NULL); + + /* Hard reset only: set epochs to 0, change node ID. */ + if (hard) { + sds oldname; + + server.cluster->currentEpoch = 0; + server.cluster->lastVoteEpoch = 0; + myself->configEpoch = 0; + serverLog(LL_NOTICE, "configEpoch set to 0 via CLUSTER RESET HARD"); + + /* To change the Node ID we need to remove the old name from the + * nodes table, change the ID, and re-add back with new name. */ + oldname = sdsnewlen(myself->name, CLUSTER_NAMELEN); + dictDelete(server.cluster->nodes,oldname); + sdsfree(oldname); + getRandomHexChars(myself->name, CLUSTER_NAMELEN); + getRandomHexChars(myself->shard_id, CLUSTER_NAMELEN); + clusterAddNode(myself); + serverLog(LL_NOTICE,"Node hard reset, now I'm %.40s", myself->name); + } + + /* Re-populate shards */ + clusterAddNodeToShard(myself->shard_id, myself); + + /* Make sure to persist the new config and update the state. 
*/ + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_FSYNC_CONFIG); +} + +/* ----------------------------------------------------------------------------- + * CLUSTER communication link + * -------------------------------------------------------------------------- */ +static clusterMsgSendBlock *createClusterMsgSendBlock(int type, uint32_t msglen) { + uint32_t blocklen = msglen + sizeof(clusterMsgSendBlock) - sizeof(clusterMsg); + clusterMsgSendBlock *msgblock = zcalloc(blocklen); + msgblock->refcount = 1; + msgblock->totlen = blocklen; + server.stat_cluster_links_memory += blocklen; + clusterBuildMessageHdr(&msgblock->msg,type,msglen); + return msgblock; +} + +static void clusterMsgSendBlockDecrRefCount(void *node) { + clusterMsgSendBlock *msgblock = (clusterMsgSendBlock*)node; + msgblock->refcount--; + serverAssert(msgblock->refcount >= 0); + if (msgblock->refcount == 0) { + server.stat_cluster_links_memory -= msgblock->totlen; + zfree(msgblock); + } +} + +clusterLink *createClusterLink(clusterNode *node) { + clusterLink *link = zmalloc(sizeof(*link)); + link->ctime = mstime(); + link->send_msg_queue = listCreate(); + listSetFreeMethod(link->send_msg_queue, clusterMsgSendBlockDecrRefCount); + link->head_msg_send_offset = 0; + link->send_msg_queue_mem = sizeof(list); + link->rcvbuf = zmalloc(link->rcvbuf_alloc = RCVBUF_INIT_LEN); + link->rcvbuf_len = 0; + server.stat_cluster_links_memory += link->rcvbuf_alloc + link->send_msg_queue_mem; + link->conn = NULL; + link->node = node; + /* Related node can only possibly be known at link creation time if this is an outbound link */ + link->inbound = (node == NULL); + if (!link->inbound) { + node->link = link; + } + return link; +} + +/* Free a cluster link, but does not free the associated node of course. + * This function will just make sure that the original node associated + * with this link will have the 'link' field set to NULL. 
*/ +void freeClusterLink(clusterLink *link) { + if (link->conn) { + connClose(link->conn); + link->conn = NULL; + } + server.stat_cluster_links_memory -= sizeof(list) + listLength(link->send_msg_queue)*sizeof(listNode); + listRelease(link->send_msg_queue); + server.stat_cluster_links_memory -= link->rcvbuf_alloc; + zfree(link->rcvbuf); + if (link->node) { + if (link->node->link == link) { + serverAssert(!link->inbound); + link->node->link = NULL; + } else if (link->node->inbound_link == link) { + serverAssert(link->inbound); + link->node->inbound_link = NULL; + } + } + zfree(link); +} + +void setClusterNodeToInboundClusterLink(clusterNode *node, clusterLink *link) { + serverAssert(!link->node); + serverAssert(link->inbound); + if (node->inbound_link) { + /* A peer may disconnect and then reconnect with us, and it's not guaranteed that + * we would always process the disconnection of the existing inbound link before + * accepting a new existing inbound link. Therefore, it's possible to have more than + * one inbound link from the same node at the same time. Our cleanup logic assumes + * a one to one relationship between nodes and inbound links, so we need to kill + * one of the links. The existing link is more likely the outdated one, but it's + * possible the other node may need to open another link. */ + serverLog(LL_DEBUG, "Replacing inbound link fd %d from node %.40s with fd %d", + node->inbound_link->conn->fd, node->name, link->conn->fd); + freeClusterLink(node->inbound_link); + } + serverAssert(!node->inbound_link); + node->inbound_link = link; + link->node = node; +} + +static void clusterConnAcceptHandler(connection *conn) { + clusterLink *link; + + if (connGetState(conn) != CONN_STATE_CONNECTED) { + serverLog(LL_VERBOSE, + "Error accepting cluster node connection: %s", connGetLastError(conn)); + connClose(conn); + return; + } + + /* Create a link object we use to handle the connection. + * It gets passed to the readable handler when data is available. 
+ * Initially the link->node pointer is set to NULL as we don't know + * which node it is, but the right node is referenced once we know the + * node identity. */ + link = createClusterLink(NULL); + link->conn = conn; + connSetPrivateData(conn, link); + + /* Register read handler */ + connSetReadHandler(conn, clusterReadHandler); +} + +#define MAX_CLUSTER_ACCEPTS_PER_CALL 1000 +void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { + int cport, cfd; + int max = MAX_CLUSTER_ACCEPTS_PER_CALL; + char cip[NET_IP_STR_LEN]; + int require_auth = TLS_CLIENT_AUTH_YES; + UNUSED(el); + UNUSED(mask); + UNUSED(privdata); + + /* If the server is starting up, don't accept cluster connections: + * UPDATE messages may interact with the database content. */ + if (server.masterhost == NULL && server.loading) return; + + while(max--) { + cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport); + if (cfd == ANET_ERR) { + if (errno != EWOULDBLOCK) + serverLog(LL_VERBOSE, + "Error accepting cluster node: %s", server.neterr); + return; + } + + connection *conn = connCreateAccepted(connTypeOfCluster(), cfd, &require_auth); + + /* Make sure connection is not in an error state */ + if (connGetState(conn) != CONN_STATE_ACCEPTING) { + serverLog(LL_VERBOSE, + "Error creating an accepting connection for cluster node: %s", + connGetLastError(conn)); + connClose(conn); + return; + } + connEnableTcpNoDelay(conn); + connKeepAlive(conn,server.cluster_node_timeout / 1000 * 2); + + /* Use non-blocking I/O for cluster messages. */ + serverLog(LL_VERBOSE,"Accepting cluster node connection from %s:%d", cip, cport); + + /* Accept the connection now. connAccept() may call our handler directly + * or schedule it for later depending on connection implementation. 
+ */ + if (connAccept(conn, clusterConnAcceptHandler) == C_ERR) { + if (connGetState(conn) == CONN_STATE_ERROR) + serverLog(LL_VERBOSE, + "Error accepting cluster node connection: %s", + connGetLastError(conn)); + connClose(conn); + return; + } + } +} + +/* Return the approximated number of sockets we are using in order to + * take the cluster bus connections. */ +unsigned long getClusterConnectionsCount(void) { + /* We decrement the number of nodes by one, since there is the + * "myself" node too in the list. Each node uses two file descriptors, + * one incoming and one outgoing, thus the multiplication by 2. */ + return server.cluster_enabled ? + ((dictSize(server.cluster->nodes)-1)*2) : 0; +} + +/* ----------------------------------------------------------------------------- + * CLUSTER node API + * -------------------------------------------------------------------------- */ + +/* Create a new cluster node, with the specified flags. + * If "nodename" is NULL this is considered a first handshake and a random + * node name is assigned to this node (it will be fixed later when we'll + * receive the first pong). + * + * The node is created and returned to the user, but it is not automatically + * added to the nodes hash table. 
*/ +clusterNode *createClusterNode(char *nodename, int flags) { + clusterNode *node = zmalloc(sizeof(*node)); + + if (nodename) + memcpy(node->name, nodename, CLUSTER_NAMELEN); + else + getRandomHexChars(node->name, CLUSTER_NAMELEN); + getRandomHexChars(node->shard_id, CLUSTER_NAMELEN); + node->ctime = mstime(); + node->configEpoch = 0; + node->flags = flags; + memset(node->slots,0,sizeof(node->slots)); + node->slot_info_pairs = NULL; + node->slot_info_pairs_count = 0; + node->numslots = 0; + node->numslaves = 0; + node->slaves = NULL; + node->slaveof = NULL; + node->last_in_ping_gossip = 0; + node->ping_sent = node->pong_received = 0; + node->data_received = 0; + node->fail_time = 0; + node->link = NULL; + node->inbound_link = NULL; + memset(node->ip,0,sizeof(node->ip)); + node->hostname = sdsempty(); + node->human_nodename = sdsempty(); + node->tcp_port = 0; + node->cport = 0; + node->tls_port = 0; + node->fail_reports = listCreate(); + node->voted_time = 0; + node->orphaned_time = 0; + node->repl_offset_time = 0; + node->repl_offset = 0; + listSetFreeMethod(node->fail_reports,zfree); + return node; +} + +/* This function is called every time we get a failure report from a node. + * The side effect is to populate the fail_reports list (or to update + * the timestamp of an existing report). + * + * 'failing' is the node that is in failure state according to the + * 'sender' node. + * + * The function returns 0 if it just updates a timestamp of an existing + * failure report from the same sender. 1 is returned if a new failure + * report is created. */ +int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) { + list *l = failing->fail_reports; + listNode *ln; + listIter li; + clusterNodeFailReport *fr; + + /* If a failure report from the same sender already exists, just update + * the timestamp. 
*/ + listRewind(l,&li); + while ((ln = listNext(&li)) != NULL) { + fr = ln->value; + if (fr->node == sender) { + fr->time = mstime(); + return 0; + } + } + + /* Otherwise create a new report. */ + fr = zmalloc(sizeof(*fr)); + fr->node = sender; + fr->time = mstime(); + listAddNodeTail(l,fr); + return 1; +} + +/* Remove failure reports that are too old, where too old means reasonably + * older than the global node timeout. Note that anyway for a node to be + * flagged as FAIL we need to have a local PFAIL state that is at least + * older than the global node timeout, so we don't just trust the number + * of failure reports from other nodes. */ +void clusterNodeCleanupFailureReports(clusterNode *node) { + list *l = node->fail_reports; + listNode *ln; + listIter li; + clusterNodeFailReport *fr; + mstime_t maxtime = server.cluster_node_timeout * + CLUSTER_FAIL_REPORT_VALIDITY_MULT; + mstime_t now = mstime(); + + listRewind(l,&li); + while ((ln = listNext(&li)) != NULL) { + fr = ln->value; + if (now - fr->time > maxtime) listDelNode(l,ln); + } +} + +/* Remove the failing report for 'node' if it was previously considered + * failing by 'sender'. This function is called when a node informs us via + * gossip that a node is OK from its point of view (no FAIL or PFAIL flags). + * + * Note that this function is called relatively often as it gets called even + * when there are no nodes failing, and is O(N), however when the cluster is + * fine the failure reports list is empty so the function runs in constant + * time. + * + * The function returns 1 if the failure report was found and removed. + * Otherwise 0 is returned. */ +int clusterNodeDelFailureReport(clusterNode *node, clusterNode *sender) { + list *l = node->fail_reports; + listNode *ln; + listIter li; + clusterNodeFailReport *fr; + + /* Search for a failure report from this sender. 
*/ + listRewind(l,&li); + while ((ln = listNext(&li)) != NULL) { + fr = ln->value; + if (fr->node == sender) break; + } + if (!ln) return 0; /* No failure report from this sender. */ + + /* Remove the failure report. */ + listDelNode(l,ln); + clusterNodeCleanupFailureReports(node); + return 1; +} + +/* Return the number of external nodes that believe 'node' is failing, + * not including this node, that may have a PFAIL or FAIL state for this + * node as well. */ +int clusterNodeFailureReportsCount(clusterNode *node) { + clusterNodeCleanupFailureReports(node); + return listLength(node->fail_reports); +} + +int clusterNodeRemoveSlave(clusterNode *master, clusterNode *slave) { + int j; + + for (j = 0; j < master->numslaves; j++) { + if (master->slaves[j] == slave) { + if ((j+1) < master->numslaves) { + int remaining_slaves = (master->numslaves - j) - 1; + memmove(master->slaves+j,master->slaves+(j+1), + (sizeof(*master->slaves) * remaining_slaves)); + } + master->numslaves--; + if (master->numslaves == 0) + master->flags &= ~CLUSTER_NODE_MIGRATE_TO; + return C_OK; + } + } + return C_ERR; +} + +int clusterNodeAddSlave(clusterNode *master, clusterNode *slave) { + int j; + + /* If it's already a slave, don't add it again. */ + for (j = 0; j < master->numslaves; j++) + if (master->slaves[j] == slave) return C_ERR; + master->slaves = zrealloc(master->slaves, + sizeof(clusterNode*)*(master->numslaves+1)); + master->slaves[master->numslaves] = slave; + master->numslaves++; + master->flags |= CLUSTER_NODE_MIGRATE_TO; + return C_OK; +} + +int clusterCountNonFailingSlaves(clusterNode *n) { + int j, okslaves = 0; + + for (j = 0; j < n->numslaves; j++) + if (!nodeFailed(n->slaves[j])) okslaves++; + return okslaves; +} + +/* Low level cleanup of the node structure. Only called by clusterDelNode(). */ +void freeClusterNode(clusterNode *n) { + sds nodename; + int j; + + /* If the node has associated slaves, we have to set + * all the slaves->slaveof fields to NULL (unknown). 
*/ + for (j = 0; j < n->numslaves; j++) + n->slaves[j]->slaveof = NULL; + + /* Remove this node from the list of slaves of its master. */ + if (nodeIsSlave(n) && n->slaveof) clusterNodeRemoveSlave(n->slaveof,n); + + /* Unlink from the set of nodes. */ + nodename = sdsnewlen(n->name, CLUSTER_NAMELEN); + serverAssert(dictDelete(server.cluster->nodes,nodename) == DICT_OK); + sdsfree(nodename); + sdsfree(n->hostname); + sdsfree(n->human_nodename); + + /* Release links and associated data structures. */ + if (n->link) freeClusterLink(n->link); + if (n->inbound_link) freeClusterLink(n->inbound_link); + listRelease(n->fail_reports); + zfree(n->slaves); + zfree(n); +} + +/* Add a node to the nodes hash table */ +void clusterAddNode(clusterNode *node) { + int retval; + + retval = dictAdd(server.cluster->nodes, + sdsnewlen(node->name,CLUSTER_NAMELEN), node); + serverAssert(retval == DICT_OK); +} + +/* Remove a node from the cluster. The function performs the high level + * cleanup, calling freeClusterNode() for the low level cleanup. + * Here we do the following: + * + * 1) Mark all the slots handled by it as unassigned. + * 2) Remove all the failure reports sent by this node and referenced by + * other nodes. + * 3) Remove the node from the owning shard + * 4) Free the node with freeClusterNode() that will in turn remove it + * from the hash table and from the list of slaves of its master, if + * it is a slave node. + */ +void clusterDelNode(clusterNode *delnode) { + int j; + dictIterator *di; + dictEntry *de; + + /* 1) Mark slots as unassigned. */ + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (server.cluster->importing_slots_from[j] == delnode) + server.cluster->importing_slots_from[j] = NULL; + if (server.cluster->migrating_slots_to[j] == delnode) + server.cluster->migrating_slots_to[j] = NULL; + if (server.cluster->slots[j] == delnode) + clusterDelSlot(j); + } + + /* 2) Remove failure reports. 
*/ + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (node == delnode) continue; + clusterNodeDelFailureReport(node,delnode); + } + dictReleaseIterator(di); + + /* 3) Remove the node from the owning shard */ + clusterRemoveNodeFromShard(delnode); + + /* 4) Free the node, unlinking it from the cluster. */ + freeClusterNode(delnode); +} + +/* Node lookup by name */ +clusterNode *clusterLookupNode(const char *name, int length) { + if (verifyClusterNodeId(name, length) != C_OK) return NULL; + sds s = sdsnewlen(name, length); + dictEntry *de = dictFind(server.cluster->nodes, s); + sdsfree(s); + if (de == NULL) return NULL; + return dictGetVal(de); +} + +/* Get all the nodes in my shard. + * Note that the list returned is not computed on the fly + * via slaveof; rather, it is maintained permanently to + * track the shard membership and its life cycle is tied + * to this Redis process. Therefore, the caller must not + * release the list. */ +list *clusterGetNodesInMyShard(clusterNode *node) { + sds s = sdsnewlen(node->shard_id, CLUSTER_NAMELEN); + dictEntry *de = dictFind(server.cluster->shards,s); + sdsfree(s); + return (de != NULL) ? dictGetVal(de) : NULL; +} + +/* This is only used after the handshake. When we connect a given IP/PORT + * as a result of CLUSTER MEET we don't have the node name yet, so we + * pick a random one, and will fix it when we receive the PONG request using + * this function. 
*/ +void clusterRenameNode(clusterNode *node, char *newname) { + int retval; + sds s = sdsnewlen(node->name, CLUSTER_NAMELEN); + + serverLog(LL_DEBUG,"Renaming node %.40s into %.40s", + node->name, newname); + retval = dictDelete(server.cluster->nodes, s); + sdsfree(s); + serverAssert(retval == DICT_OK); + memcpy(node->name, newname, CLUSTER_NAMELEN); + clusterAddNode(node); + clusterAddNodeToShard(node->shard_id, node); +} + +void clusterAddNodeToShard(const char *shard_id, clusterNode *node) { + sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN); + dictEntry *de = dictFind(server.cluster->shards,s); + if (de == NULL) { + list *l = listCreate(); + listAddNodeTail(l, node); + serverAssert(dictAdd(server.cluster->shards, s, l) == DICT_OK); + } else { + list *l = dictGetVal(de); + if (listSearchKey(l, node) == NULL) { + listAddNodeTail(l, node); + } + sdsfree(s); + } +} + +void clusterRemoveNodeFromShard(clusterNode *node) { + sds s = sdsnewlen(node->shard_id, CLUSTER_NAMELEN); + dictEntry *de = dictFind(server.cluster->shards, s); + if (de != NULL) { + list *l = dictGetVal(de); + listNode *ln = listSearchKey(l, node); + if (ln != NULL) { + listDelNode(l, ln); + } + if (listLength(l) == 0) { + dictDelete(server.cluster->shards, s); + } + } + sdsfree(s); +} + +/* ----------------------------------------------------------------------------- + * CLUSTER config epoch handling + * -------------------------------------------------------------------------- */ + +/* Return the greatest configEpoch found in the cluster, or the current + * epoch if greater than any node configEpoch. 
*/ +uint64_t clusterGetMaxEpoch(void) { + uint64_t max = 0; + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + if (node->configEpoch > max) max = node->configEpoch; + } + dictReleaseIterator(di); + if (max < server.cluster->currentEpoch) max = server.cluster->currentEpoch; + return max; +} + +/* If this node epoch is zero or is not already the greatest across the + * cluster (from the POV of the local configuration), this function will: + * + * 1) Generate a new config epoch, incrementing the current epoch. + * 2) Assign the new epoch to this node, WITHOUT any consensus. + * 3) Persist the configuration on disk before sending packets with the + * new configuration. + * + * If the new config epoch is generated and assigned, C_OK is returned, + * otherwise C_ERR is returned (since the node has already the greatest + * configuration around) and no operation is performed. + * + * Important note: this function violates the principle that config epochs + * should be generated with consensus and should be unique across the cluster. + * However Redis Cluster uses this auto-generated new config epochs in two + * cases: + * + * 1) When slots are closed after importing. Otherwise resharding would be + * too expensive. + * 2) When CLUSTER FAILOVER is called with options that force a slave to + * failover its master even if there is not master majority able to + * create a new configuration epoch. + * + * Redis Cluster will not explode using this function, even in the case of + * a collision between this node and another node, generating the same + * configuration epoch unilaterally, because the config epoch conflict + * resolution algorithm will eventually move colliding nodes to different + * config epochs. However using this function may violate the "last failover + * wins" rule, so should only be used with care. 
*/ +int clusterBumpConfigEpochWithoutConsensus(void) { + uint64_t maxEpoch = clusterGetMaxEpoch(); + + if (myself->configEpoch == 0 || + myself->configEpoch != maxEpoch) + { + server.cluster->currentEpoch++; + myself->configEpoch = server.cluster->currentEpoch; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_FSYNC_CONFIG); + serverLog(LL_NOTICE, + "New configEpoch set to %llu", + (unsigned long long) myself->configEpoch); + return C_OK; + } else { + return C_ERR; + } +} + +/* This function is called when this node is a master, and we receive from + * another master a configuration epoch that is equal to our configuration + * epoch. + * + * BACKGROUND + * + * It is not possible that different slaves get the same config + * epoch during a failover election, because the slaves need to get voted + * by a majority. However when we perform a manual resharding of the cluster + * the node will assign a configuration epoch to itself without asking + * for agreement. Usually resharding happens when the cluster is working well + * and is supervised by the sysadmin, however it is possible for a failover + * to happen exactly while the node we are resharding a slot to assigns itself + * a new configuration epoch, but before it is able to propagate it. + * + * So technically it is possible in this condition that two nodes end with + * the same configuration epoch. + * + * Another possibility is that there are bugs in the implementation causing + * this to happen. + * + * Moreover when a new cluster is created, all the nodes start with the same + * configEpoch. This collision resolution code allows nodes to automatically + * end with a different configEpoch at startup. + * + * In all the cases, we want a mechanism that resolves this issue automatically + * as a safeguard. 
The same configuration epoch for masters serving different + * set of slots is not harmful, but it is if the nodes end serving the same + * slots for some reason (manual errors or software bugs) without a proper + * failover procedure. + * + * In general we want a system that eventually always ends with different + * masters having different configuration epochs whatever happened, since + * nothing is worse than a split-brain condition in a distributed system. + * + * BEHAVIOR + * + * When this function gets called, what happens is that if this node + * has the lexicographically smaller Node ID compared to the other node + * with the conflicting epoch (the 'sender' node), it will assign itself + * the greatest configuration epoch currently detected among nodes plus 1. + * + * This means that even if there are multiple nodes colliding, the node + * with the greatest Node ID never moves forward, so eventually all the nodes + * end with a different configuration epoch. + */ +void clusterHandleConfigEpochCollision(clusterNode *sender) { + /* Prerequisites: nodes have the same configEpoch and are both masters. */ + if (sender->configEpoch != myself->configEpoch || + !clusterNodeIsMaster(sender) || !clusterNodeIsMaster(myself)) return; + /* Don't act if the colliding node has a smaller Node ID. */ + if (memcmp(sender->name,myself->name,CLUSTER_NAMELEN) <= 0) return; + /* Get the next ID available at the best of this node knowledge. */ + server.cluster->currentEpoch++; + myself->configEpoch = server.cluster->currentEpoch; + clusterSaveConfigOrDie(1); + serverLog(LL_VERBOSE, + "WARNING: configEpoch collision with node %.40s (%s)." 
+ " configEpoch set to %llu", + sender->name,sender->human_nodename, + (unsigned long long) myself->configEpoch); +} + +/* ----------------------------------------------------------------------------- + * CLUSTER nodes blacklist + * + * The nodes blacklist is just a way to ensure that a given node with a given + * Node ID is not re-added before some time elapsed (this time is specified + * in seconds in CLUSTER_BLACKLIST_TTL). + * + * This is useful when we want to remove a node from the cluster completely: + * when CLUSTER FORGET is called, it also puts the node into the blacklist so + * that even if we receive gossip messages from other nodes that still remember + * about the node we want to remove, we don't re-add it before some time. + * + * Currently the CLUSTER_BLACKLIST_TTL is set to 1 minute, this means + * that redis-cli has 60 seconds to send CLUSTER FORGET messages to nodes + * in the cluster without dealing with the problem of other nodes re-adding + * back the node to nodes we already sent the FORGET command to. + * + * The data structure used is a hash table with an sds string representing + * the node ID as key, and the time when it is ok to re-add the node as + * value. + * -------------------------------------------------------------------------- */ + +#define CLUSTER_BLACKLIST_TTL 60 /* 1 minute. */ + + +/* Before the addNode() or Exists() operations we always remove expired + * entries from the black list. This is an O(N) operation but it is not a + * problem since add / exists operations are called very infrequently and + * the hash table is supposed to contain very few elements at max. + * However without the cleanup during long uptime and with some automated + * node add/removal procedures, entries could accumulate. 
*/ +void clusterBlacklistCleanup(void) { + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes_black_list); + while((de = dictNext(di)) != NULL) { + int64_t expire = dictGetUnsignedIntegerVal(de); + + if (expire < server.unixtime) + dictDelete(server.cluster->nodes_black_list,dictGetKey(de)); + } + dictReleaseIterator(di); +} + +/* Cleanup the blacklist and add a new node ID to the black list. */ +void clusterBlacklistAddNode(clusterNode *node) { + dictEntry *de; + sds id = sdsnewlen(node->name,CLUSTER_NAMELEN); + + clusterBlacklistCleanup(); + if (dictAdd(server.cluster->nodes_black_list,id,NULL) == DICT_OK) { + /* If the key was added, duplicate the sds string representation of + * the key for the next lookup. We'll free it at the end. */ + id = sdsdup(id); + } + de = dictFind(server.cluster->nodes_black_list,id); + dictSetUnsignedIntegerVal(de,time(NULL)+CLUSTER_BLACKLIST_TTL); + sdsfree(id); +} + +/* Return non-zero if the specified node ID exists in the blacklist. + * You don't need to pass an sds string here, any pointer to 40 bytes + * will work. */ +int clusterBlacklistExists(char *nodeid) { + sds id = sdsnewlen(nodeid,CLUSTER_NAMELEN); + int retval; + + clusterBlacklistCleanup(); + retval = dictFind(server.cluster->nodes_black_list,id) != NULL; + sdsfree(id); + return retval; +} + +/* ----------------------------------------------------------------------------- + * CLUSTER messages exchange - PING/PONG and gossip + * -------------------------------------------------------------------------- */ + +/* This function checks if a given node should be marked as FAIL. + * It happens if the following conditions are met: + * + * 1) We received enough failure reports from other master nodes via gossip. + * Enough means that the majority of the masters signaled the node is + * down recently. + * 2) We believe this node is in PFAIL state. 
+ * + * If a failure is detected we also inform the whole cluster about this + * event trying to force every other node to set the FAIL flag for the node. + * + * Note that the form of agreement used here is weak, as we collect the majority + * of masters state during some time, and even if we force agreement by + * propagating the FAIL message, because of partitions we may not reach every + * node. However: + * + * 1) Either we reach the majority and eventually the FAIL state will propagate + * to all the cluster. + * 2) Or there is no majority so no slave promotion will be authorized and the + * FAIL flag will be cleared after some time. + */ +void markNodeAsFailingIfNeeded(clusterNode *node) { + int failures; + int needed_quorum = (server.cluster->size / 2) + 1; + + if (!nodeTimedOut(node)) return; /* We can reach it. */ + if (nodeFailed(node)) return; /* Already FAILing. */ + + failures = clusterNodeFailureReportsCount(node); + /* Also count myself as a voter if I'm a master. */ + if (clusterNodeIsMaster(myself)) failures++; + if (failures < needed_quorum) return; /* No weak agreement from masters. */ + + serverLog(LL_NOTICE, + "Marking node %.40s (%s) as failing (quorum reached).", node->name, node->human_nodename); + + /* Mark the node as failing. */ + node->flags &= ~CLUSTER_NODE_PFAIL; + node->flags |= CLUSTER_NODE_FAIL; + node->fail_time = mstime(); + + /* Broadcast the failing node name to everybody, forcing all the other + * reachable nodes to flag the node as FAIL. + * We do that even if this node is a replica and not a master: anyway + * the failing state is triggered collecting failure reports from masters, + * so here the replica is only helping propagating this status. */ + clusterSendFail(node->name); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); +} + +/* This function is called only if a node is marked as FAIL, but we are able + * to reach it again. It checks if there are the conditions to undo the FAIL + * state. 
*/ +void clearNodeFailureIfNeeded(clusterNode *node) { + mstime_t now = mstime(); + + serverAssert(nodeFailed(node)); + + /* For slaves we always clear the FAIL flag if we can contact the + * node again. */ + if (nodeIsSlave(node) || node->numslots == 0) { + serverLog(LL_NOTICE, + "Clear FAIL state for node %.40s (%s):%s is reachable again.", + node->name,node->human_nodename, + nodeIsSlave(node) ? "replica" : "master without slots"); + node->flags &= ~CLUSTER_NODE_FAIL; + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + } + + /* If it is a master and... + * 1) The FAIL state is old enough. + * 2) It is yet serving slots from our point of view (not failed over). + * Apparently no one is going to fix these slots, clear the FAIL flag. */ + if (clusterNodeIsMaster(node) && node->numslots > 0 && + (now - node->fail_time) > + (server.cluster_node_timeout * CLUSTER_FAIL_UNDO_TIME_MULT)) + { + serverLog(LL_NOTICE, + "Clear FAIL state for node %.40s (%s): is reachable again and nobody is serving its slots after some time.", + node->name, node->human_nodename); + node->flags &= ~CLUSTER_NODE_FAIL; + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + } +} + +/* Return true if we already have a node in HANDSHAKE state matching the + * specified ip address and port number. This function is used in order to + * avoid adding a new handshake node for the same address multiple times. */ +int clusterHandshakeInProgress(char *ip, int port, int cport) { + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (!nodeInHandshake(node)) continue; + if (!strcasecmp(node->ip,ip) && + getNodeDefaultClientPort(node) == port && + node->cport == cport) break; + } + dictReleaseIterator(di); + return de != NULL; +} + +/* Start a handshake with the specified address if there is not one + * already in progress. 
Returns non-zero if the handshake was actually + * started. On error zero is returned and errno is set to one of the + * following values: + * + * EAGAIN - There is already a handshake in progress for this address. + * EINVAL - IP or port are not valid. */ +int clusterStartHandshake(char *ip, int port, int cport) { + clusterNode *n; + char norm_ip[NET_IP_STR_LEN]; + struct sockaddr_storage sa; + + /* IP sanity check */ + if (inet_pton(AF_INET,ip, + &(((struct sockaddr_in *)&sa)->sin_addr))) + { + sa.ss_family = AF_INET; + } else if (inet_pton(AF_INET6,ip, + &(((struct sockaddr_in6 *)&sa)->sin6_addr))) + { + sa.ss_family = AF_INET6; + } else { + errno = EINVAL; + return 0; + } + + /* Port sanity check */ + if (port <= 0 || port > 65535 || cport <= 0 || cport > 65535) { + errno = EINVAL; + return 0; + } + + /* Set norm_ip as the normalized string representation of the node + * IP address. */ + memset(norm_ip,0,NET_IP_STR_LEN); + if (sa.ss_family == AF_INET) + inet_ntop(AF_INET, + (void*)&(((struct sockaddr_in *)&sa)->sin_addr), + norm_ip,NET_IP_STR_LEN); + else + inet_ntop(AF_INET6, + (void*)&(((struct sockaddr_in6 *)&sa)->sin6_addr), + norm_ip,NET_IP_STR_LEN); + + if (clusterHandshakeInProgress(norm_ip,port,cport)) { + errno = EAGAIN; + return 0; + } + + /* Add the node with a random address (NULL as first argument to + * createClusterNode()). Everything will be fixed during the + * handshake. 
*/ + n = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_MEET); + memcpy(n->ip,norm_ip,sizeof(n->ip)); + if (server.tls_cluster) { + n->tls_port = port; + } else { + n->tcp_port = port; + } + n->cport = cport; + clusterAddNode(n); + return 1; +} + +static void getClientPortFromClusterMsg(clusterMsg *hdr, int *tls_port, int *tcp_port) { + if (server.tls_cluster) { + *tls_port = ntohs(hdr->port); + *tcp_port = ntohs(hdr->pport); + } else { + *tls_port = ntohs(hdr->pport); + *tcp_port = ntohs(hdr->port); + } +} + +static void getClientPortFromGossip(clusterMsgDataGossip *g, int *tls_port, int *tcp_port) { + if (server.tls_cluster) { + *tls_port = ntohs(g->port); + *tcp_port = ntohs(g->pport); + } else { + *tls_port = ntohs(g->pport); + *tcp_port = ntohs(g->port); + } +} + +/* Returns a string with the byte representation of the node ID (i.e. nodename) + * along with 8 trailing bytes for debugging purposes. */ +char *getCorruptedNodeIdByteString(clusterMsgDataGossip *gossip_msg) { + const int num_bytes = CLUSTER_NAMELEN + 8; + /* Allocate enough room for 4 chars per byte + null terminator */ + char *byte_string = (char*) zmalloc((num_bytes*4) + 1); + const char *name_ptr = gossip_msg->nodename; + + /* Ensure we won't print beyond the bounds of the message */ + serverAssert(name_ptr + num_bytes <= (char*)gossip_msg + sizeof(clusterMsgDataGossip)); + + for (int i = 0; i < num_bytes; i++) { + snprintf(byte_string + 4*i, 5, "\\x%02hhX", name_ptr[i]); + } + return byte_string; +} + +/* Returns the number of nodes in the gossip with invalid IDs. */ +int verifyGossipSectionNodeIds(clusterMsgDataGossip *g, uint16_t count) { + int invalid_ids = 0; + for (int i = 0; i < count; i++) { + const char *nodename = g[i].nodename; + if (verifyClusterNodeId(nodename, CLUSTER_NAMELEN) != C_OK) { + invalid_ids++; + char *raw_node_id = getCorruptedNodeIdByteString(g); + serverLog(LL_WARNING, + "Received gossip about a node with invalid ID %.40s. 
For debugging purposes, " + "the 48 bytes including the invalid ID and 8 trailing bytes are: %s", + nodename, raw_node_id); + zfree(raw_node_id); + } + } + return invalid_ids; +} + +/* Process the gossip section of PING or PONG packets. + * Note that this function assumes that the packet is already sanity-checked + * by the caller, not in the content of the gossip section, but in the + * length. */ +void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { + uint16_t count = ntohs(hdr->count); + clusterMsgDataGossip *g = (clusterMsgDataGossip*) hdr->data.ping.gossip; + clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender, CLUSTER_NAMELEN); + + /* Abort if the gossip contains invalid node IDs to avoid adding incorrect information to + * the nodes dictionary. An invalid ID indicates memory corruption on the sender side. */ + int invalid_ids = verifyGossipSectionNodeIds(g, count); + if (invalid_ids) { + if (sender) { + serverLog(LL_WARNING, "Node %.40s (%s) gossiped %d nodes with invalid IDs.", sender->name, sender->human_nodename, invalid_ids); + } else { + serverLog(LL_WARNING, "Unknown node gossiped %d nodes with invalid IDs.", invalid_ids); + } + return; + } + + while(count--) { + uint16_t flags = ntohs(g->flags); + clusterNode *node; + sds ci; + + if (server.verbosity == LL_DEBUG) { + ci = representClusterNodeFlags(sdsempty(), flags); + serverLog(LL_DEBUG,"GOSSIP %.40s %s:%d@%d %s", + g->nodename, + g->ip, + ntohs(g->port), + ntohs(g->cport), + ci); + sdsfree(ci); + } + + /* Convert port and pport into TCP port and TLS port. */ + int msg_tls_port, msg_tcp_port; + getClientPortFromGossip(g, &msg_tls_port, &msg_tcp_port); + + /* Update our state accordingly to the gossip sections */ + node = clusterLookupNode(g->nodename, CLUSTER_NAMELEN); + /* Ignore gossips about self. */ + if (node && node != myself) { + /* We already know this node. + Handle failure reports, only when the sender is a master. 
*/
            if (sender && clusterNodeIsMaster(sender)) {
                /* Failure reports are only accepted from masters: replicas do
                 * not participate in the failure-detection quorum. */
                if (flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) {
                    if (clusterNodeAddFailureReport(node,sender)) {
                        serverLog(LL_VERBOSE,
                            "Node %.40s (%s) reported node %.40s (%s) as not reachable.",
                            sender->name, sender->human_nodename, node->name, node->human_nodename);
                    }
                    markNodeAsFailingIfNeeded(node);
                } else {
                    if (clusterNodeDelFailureReport(node,sender)) {
                        serverLog(LL_VERBOSE,
                            "Node %.40s (%s) reported node %.40s (%s) is back online.",
                            sender->name, sender->human_nodename, node->name, node->human_nodename);
                    }
                }
            }

            /* If from our POV the node is up (no failure flags are set),
             * we have no pending ping for the node, nor we have failure
             * reports for this node, update the last pong time with the
             * one we see from the other nodes. */
            if (!(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) &&
                node->ping_sent == 0 &&
                clusterNodeFailureReportsCount(node) == 0)
            {
                /* On the wire pong_received is in seconds (32 bit). */
                mstime_t pongtime = ntohl(g->pong_received);
                pongtime *= 1000; /* Convert back to milliseconds. */

                /* Replace the pong time with the received one only if
                 * it's greater than our view but is not in the future
                 * (with 500 milliseconds tolerance) from the POV of our
                 * clock. */
                if (pongtime <= (server.mstime+500) &&
                    pongtime > node->pong_received)
                {
                    node->pong_received = pongtime;
                }
            }

            /* If we already know this node, but it is not reachable, and
             * we see a different address in the gossip section of a node that
             * can talk with this other node, update the address, disconnect
             * the old link if any, so that we'll attempt to connect with the
             * new address. */
            if (node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL) &&
                !(flags & CLUSTER_NODE_NOADDR) &&
                !(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) &&
                (strcasecmp(node->ip,g->ip) ||
                 node->tls_port != (server.tls_cluster ? ntohs(g->port) : ntohs(g->pport)) ||
                 node->tcp_port != (server.tls_cluster ? ntohs(g->pport) : ntohs(g->port)) ||
                 node->cport != ntohs(g->cport)))
            {
                if (node->link) freeClusterLink(node->link);
                memcpy(node->ip,g->ip,NET_IP_STR_LEN);
                node->tcp_port = msg_tcp_port;
                node->tls_port = msg_tls_port;
                node->cport = ntohs(g->cport);
                node->flags &= ~CLUSTER_NODE_NOADDR;
            }
        } else if (!node) {
            /* If it's not in NOADDR state and we don't have it, we
             * add it to our trusted dict with exact nodeid and flag.
             * Note that we cannot simply start a handshake against
             * this IP/PORT pairs, since IP/PORT can be reused already,
             * otherwise we risk joining another cluster.
             *
             * Note that we require that the sender of this gossip message
             * is a well known node in our cluster, otherwise we risk
             * joining another cluster. */
            if (sender &&
                !(flags & CLUSTER_NODE_NOADDR) &&
                !clusterBlacklistExists(g->nodename))
            {
                clusterNode *node;
                node = createClusterNode(g->nodename, flags);
                memcpy(node->ip,g->ip,NET_IP_STR_LEN);
                node->tcp_port = msg_tcp_port;
                node->tls_port = msg_tls_port;
                node->cport = ntohs(g->cport);
                clusterAddNode(node);
                clusterAddNodeToShard(node->shard_id, node);
            }
        }

        /* Next node */
        g++;
    }
}

/* IP -> string conversion. 'buf' is supposed to at least be 46 bytes.
 * If 'announced_ip' length is non-zero, it is used instead of extracting
 * the IP from the socket peer address.
 *
 * Returns C_OK on success, C_ERR if the peer address could not be
 * resolved from the link. */
int nodeIp2String(char *buf, clusterLink *link, char *announced_ip) {
    if (announced_ip[0] != '\0') {
        memcpy(buf,announced_ip,NET_IP_STR_LEN);
        buf[NET_IP_STR_LEN-1] = '\0'; /* We are not sure the input is sane. */
        return C_OK;
    } else {
        if (connAddrPeerName(link->conn, buf, NET_IP_STR_LEN, NULL) == -1) {
            serverLog(LL_NOTICE, "Error converting peer IP to string: %s",
                link->conn ?
connGetLastError(link->conn) : "no link");
            return C_ERR;
        }
        return C_OK;
    }
}

/* Update the node address to the IP address that can be extracted
 * from link->fd, or if hdr->myip is non empty, to the address the node
 * is announcing us. The port is taken from the packet header as well.
 *
 * If the address or port changed, disconnect the node link so that we'll
 * connect again to the new address.
 *
 * If the ip/port pair are already correct no operation is performed at
 * all.
 *
 * The function returns 0 if the node address is still the same,
 * otherwise 1 is returned. */
int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link,
                              clusterMsg *hdr)
{
    char ip[NET_IP_STR_LEN] = {0};
    int cport = ntohs(hdr->cport);
    int tcp_port, tls_port;
    getClientPortFromClusterMsg(hdr, &tls_port, &tcp_port);

    /* We don't proceed if the link is the same as the sender link, as this
     * function is designed to see if the node link is consistent with the
     * symmetric link that is used to receive PINGs from the node.
     *
     * As a side effect this function never frees the passed 'link', so
     * it is safe to call during packet processing. */
    if (link == node->link) return 0;

    /* If the peer IP is unavailable for some reasons like invalid fd or closed
     * link, just give up the update this time, and the update will be retried
     * in the next round of PINGs */
    if (nodeIp2String(ip,link,hdr->myip) == C_ERR) return 0;

    /* Nothing to do when every address component already matches. */
    if (node->tcp_port == tcp_port && node->cport == cport && node->tls_port == tls_port &&
        strcmp(ip,node->ip) == 0) return 0;

    /* IP / port is different, update it.
     */
    memcpy(node->ip,ip,sizeof(ip));
    node->tcp_port = tcp_port;
    node->tls_port = tls_port;
    node->cport = cport;
    /* Drop the stale link so the next clusterCron() reconnects to the
     * new address. */
    if (node->link) freeClusterLink(node->link);
    node->flags &= ~CLUSTER_NODE_NOADDR;
    serverLog(LL_NOTICE,"Address updated for node %.40s (%s), now %s:%d",
        node->name, node->human_nodename, node->ip, getNodeDefaultClientPort(node));

    /* Check if this is our master and we have to change the
     * replication target as well. */
    if (nodeIsSlave(myself) && myself->slaveof == node)
        replicationSetMaster(node->ip, getNodeDefaultReplicationPort(node));
    return 1;
}

/* Reconfigure the specified node 'n' as a master. This function is called when
 * a node that we believed to be a slave is now acting as master in order to
 * update the state of the node. */
void clusterSetNodeAsMaster(clusterNode *n) {
    if (clusterNodeIsMaster(n)) return;

    if (n->slaveof) {
        clusterNodeRemoveSlave(n->slaveof,n);
        /* The promoted node becomes a migration target, unless it is us. */
        if (n != myself) n->flags |= CLUSTER_NODE_MIGRATE_TO;
    }
    n->flags &= ~CLUSTER_NODE_SLAVE;
    n->flags |= CLUSTER_NODE_MASTER;
    n->slaveof = NULL;

    /* Update config and state. */
    clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                         CLUSTER_TODO_UPDATE_STATE);
}

/* This function is called when we receive a master configuration via a
 * PING, PONG or UPDATE packet. What we receive is a node, a configEpoch of the
 * node, and the set of slots claimed under this configEpoch.
 *
 * What we do is to rebind the slots with newer configuration compared to our
 * local configuration, and if needed, we turn ourself into a replica of the
 * node (see the function comments for more info).
 *
 * The 'sender' is the node for which we received a configuration update.
 * Sometimes it is not actually the "Sender" of the information, like in the
 * case we receive the info via an UPDATE packet.
*/
void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoch, unsigned char *slots) {
    int j;
    clusterNode *curmaster = NULL, *newmaster = NULL;
    /* The dirty slots list is a list of slots for which we lose the ownership
     * while having still keys inside. This usually happens after a failover
     * or after a manual cluster reconfiguration operated by the admin.
     *
     * If the update message is not able to demote a master to slave (in this
     * case we'll resync with the master updating the whole key space), we
     * need to delete all the keys in the slots we lost ownership. */
    uint16_t dirty_slots[CLUSTER_SLOTS];
    int dirty_slots_count = 0;

    /* We should detect if sender is new master of our shard.
     * We will know it if all our slots were migrated to sender, and sender
     * has no slots except ours */
    int sender_slots = 0;
    int migrated_our_slots = 0;

    /* Here we set curmaster to this node or the node this node
     * replicates to if it's a slave. In the for loop we are
     * interested to check if slots are taken away from curmaster. */
    curmaster = clusterNodeIsMaster(myself) ? myself : myself->slaveof;

    if (sender == myself) {
        serverLog(LL_NOTICE,"Discarding UPDATE message about myself.");
        return;
    }

    for (j = 0; j < CLUSTER_SLOTS; j++) {
        if (bitmapTestBit(slots,j)) {
            sender_slots++;

            /* The slot is already bound to the sender of this message. */
            if (server.cluster->slots[j] == sender) {
                bitmapClearBit(server.cluster->owner_not_claiming_slot, j);
                continue;
            }

            /* The slot is in importing state, it should be modified only
             * manually via redis-cli (example: a resharding is in progress
             * and the migrating side slot was already closed and is advertising
             * a new config. We still want the slot to be closed manually).
             */
            if (server.cluster->importing_slots_from[j]) continue;

            /* We rebind the slot to the new node claiming it if:
             * 1) The slot was unassigned or the previous owner no longer owns the slot or
             *    the new node claims it with a greater configEpoch.
             * 2) We are not currently importing the slot. */
            if (isSlotUnclaimed(j) ||
                server.cluster->slots[j]->configEpoch < senderConfigEpoch)
            {
                /* Was this slot mine, and still contains keys? Mark it as
                 * a dirty slot. */
                if (server.cluster->slots[j] == myself &&
                    countKeysInSlot(j) &&
                    sender != myself)
                {
                    dirty_slots[dirty_slots_count] = j;
                    dirty_slots_count++;
                }

                /* Slot taken away from our (current or future) master:
                 * candidate for the failover-detection check below. */
                if (server.cluster->slots[j] == curmaster) {
                    newmaster = sender;
                    migrated_our_slots++;
                }
                clusterDelSlot(j);
                clusterAddSlot(sender,j);
                clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                                     CLUSTER_TODO_UPDATE_STATE|
                                     CLUSTER_TODO_FSYNC_CONFIG);
            }
        } else if (server.cluster->slots[j] == sender) {
            /* The slot is currently bound to the sender but the sender is no longer
             * claiming it. We don't want to unbind the slot yet as it can cause the cluster
             * to move to FAIL state and also throw client error. Keeping the slot bound to
             * the previous owner will cause a few client side redirects, but won't throw
             * any errors. We will keep track of the uncertainty in ownership to avoid
             * propagating misinformation about this slot's ownership using UPDATE
             * messages. */
            bitmapSetBit(server.cluster->owner_not_claiming_slot, j);
        }
    }

    /* After updating the slots configuration, don't do any actual change
     * in the state of the server if a module disabled Redis Cluster
     * keys redirections. */
    if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION)
        return;

    /* If at least one slot was reassigned from a node to another node
     * with a greater configEpoch, it is possible that:
     * 1) We are a master left without slots. This means that we were
     *    failed over and we should turn into a replica of the new
     *    master.
     * 2) We are a slave and our master is left without slots. We need
     *    to replicate to the new slots owner. */
    if (newmaster && curmaster->numslots == 0 &&
        (server.cluster_allow_replica_migration ||
         sender_slots == migrated_our_slots)) {
        serverLog(LL_NOTICE,
            "Configuration change detected. Reconfiguring myself "
            "as a replica of %.40s (%s)", sender->name, sender->human_nodename);
        clusterSetMaster(sender);
        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                             CLUSTER_TODO_UPDATE_STATE|
                             CLUSTER_TODO_FSYNC_CONFIG);
    } else if (myself->slaveof && myself->slaveof->slaveof &&
               /* In some rare case when CLUSTER FAILOVER TAKEOVER is used, it
                * can happen that myself is a replica of a replica of myself. If
                * this happens, we do nothing to avoid a crash and wait for the
                * admin to repair the cluster. */
               myself->slaveof->slaveof != myself)
    {
        /* Safeguard against sub-replicas. A replica's master can turn itself
         * into a replica if its last slot is removed. If no other node takes
         * over the slot, there is nothing else to trigger replica migration. */
        serverLog(LL_NOTICE,
            "I'm a sub-replica! Reconfiguring myself as a replica of grandmaster %.40s (%s)",
            myself->slaveof->slaveof->name, myself->slaveof->slaveof->human_nodename);
        clusterSetMaster(myself->slaveof->slaveof);
        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                             CLUSTER_TODO_UPDATE_STATE|
                             CLUSTER_TODO_FSYNC_CONFIG);
    } else if (dirty_slots_count) {
        /* If we are here, we received an update message which removed
         * ownership for certain slots we still have keys about, but still
         * we are serving some slots, so this master node was not demoted to
         * a slave.
         *
         * In order to maintain a consistent state between keys and slots
         * we need to remove all the keys from the slots we lost. */
        for (j = 0; j < dirty_slots_count; j++)
            delKeysInSlot(dirty_slots[j]);
    }
}

/* Cluster ping extensions.
 *
 * The ping/pong/meet messages support arbitrary extensions to add additional
 * metadata to the messages that are sent between the various nodes in the
 * cluster. The extensions take the form:
 * [ Header length + type (8 bytes) ]
 * [ Extension information (Arbitrary length, but must be 8 byte padded) ]
 */


/* Returns the length of a given extension */
static uint32_t getPingExtLength(clusterMsgPingExt *ext) {
    return ntohl(ext->length);
}

/* Returns the initial position of ping extensions. May return an invalid
 * address if there are no ping extensions. */
static clusterMsgPingExt *getInitialPingExt(clusterMsg *hdr, int count) {
    /* Extensions are laid out immediately after the 'count' gossip entries. */
    clusterMsgPingExt *initial = (clusterMsgPingExt*) &(hdr->data.ping.gossip[count]);
    return initial;
}

/* Given a current ping extension, returns the start of the next extension. May return
 * an invalid address if there are no further ping extensions.
*/
static clusterMsgPingExt *getNextPingExt(clusterMsgPingExt *ext) {
    clusterMsgPingExt *next = (clusterMsgPingExt *) (((char *) ext) + getPingExtLength(ext));
    return next;
}

/* All PING extensions must be 8-byte aligned */
uint32_t getAlignedPingExtSize(uint32_t dataSize) {

    return sizeof(clusterMsgPingExt) + EIGHT_BYTE_ALIGN(dataSize);
}

/* Size of the hostname extension, or 0 when no hostname is configured.
 * The +1 accounts for the NUL terminator. */
uint32_t getHostnamePingExtSize(void) {
    if (sdslen(myself->hostname) == 0) {
        return 0;
    }
    return getAlignedPingExtSize(sdslen(myself->hostname) + 1);
}

/* Size of the human nodename extension, or 0 when none is configured. */
uint32_t getHumanNodenamePingExtSize(void) {
    if (sdslen(myself->human_nodename) == 0) {
        return 0;
    }
    return getAlignedPingExtSize(sdslen(myself->human_nodename) + 1);
}

/* Size of the (fixed-length) shard-id extension. */
uint32_t getShardIdPingExtSize(void) {
    return getAlignedPingExtSize(sizeof(clusterMsgPingExtShardId));
}

/* Size of the (fixed-length) forgotten-node extension. */
uint32_t getForgottenNodeExtSize(void) {
    return getAlignedPingExtSize(sizeof(clusterMsgPingExtForgottenNode));
}

/* Fill in the type/length header of an extension (in network byte order)
 * and return a pointer to its payload area. */
void *preparePingExt(clusterMsgPingExt *ext, uint16_t type, uint32_t length) {
    ext->type = htons(type);
    ext->length = htonl(length);
    return &ext->ext[0];
}

/* Advance to the next extension using the (already host-converted) length. */
clusterMsgPingExt *nextPingExt(clusterMsgPingExt *ext) {
    return (clusterMsgPingExt *)((char*)ext + ntohl(ext->length));
}

/* 1. If a NULL hdr is provided, compute the total extension size;
 * 2. If a non-NULL hdr is provided, write all applicable ping
 *    extensions (hostname, human nodename, forgotten nodes,
 *    shard id) starting at the extension area of 'hdr'. This
 *    function returns the number of bytes the extensions occupy,
 *    and when writing also sets hdr->extensions and the EXT_DATA
 *    flag.
 */
uint32_t writePingExt(clusterMsg *hdr, int gossipcount) {
    uint16_t extensions = 0;
    uint32_t totlen = 0;
    clusterMsgPingExt *cursor = NULL;
    /* Set the initial extension position */
    if (hdr != NULL) {
        cursor = getInitialPingExt(hdr, gossipcount);
    }

    /* hostname is optional */
    if (sdslen(myself->hostname) != 0) {
        if (cursor != NULL) {
            /* Populate hostname */
            clusterMsgPingExtHostname *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_HOSTNAME, getHostnamePingExtSize());
            memcpy(ext->hostname, myself->hostname, sdslen(myself->hostname));

            /* Move the write cursor */
            cursor = nextPingExt(cursor);
        }

        totlen += getHostnamePingExtSize();
        extensions++;
    }

    if (sdslen(myself->human_nodename) != 0) {
        if (cursor != NULL) {
            /* Populate human_nodename */
            clusterMsgPingExtHumanNodename *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME, getHumanNodenamePingExtSize());
            memcpy(ext->human_nodename, myself->human_nodename, sdslen(myself->human_nodename));

            /* Move the write cursor */
            cursor = nextPingExt(cursor);
        }

        totlen += getHumanNodenamePingExtSize();
        extensions++;
    }

    /* Gossip forgotten nodes */
    if (dictSize(server.cluster->nodes_black_list) > 0) {
        dictIterator *di = dictGetIterator(server.cluster->nodes_black_list);
        dictEntry *de;
        while ((de = dictNext(di)) != NULL) {
            if (cursor != NULL) {
                /* NOTE: the expiry check runs only on the write pass
                 * (cursor != NULL); the sizing pass counts every blacklist
                 * entry, so the estimate may over-allocate for entries that
                 * expire in between, which is harmless since the returned
                 * totlen of the write pass is what ends up in the packet. */
                uint64_t expire = dictGetUnsignedIntegerVal(de);
                if ((time_t)expire < server.unixtime) continue; /* already expired */
                uint64_t ttl = expire - server.unixtime;
                clusterMsgPingExtForgottenNode *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE, getForgottenNodeExtSize());
                memcpy(ext->name, dictGetKey(de), CLUSTER_NAMELEN);
                ext->ttl = htonu64(ttl);

                /* Move the write cursor */
                cursor = nextPingExt(cursor);
            }
            totlen += getForgottenNodeExtSize();
            extensions++;
        }
        dictReleaseIterator(di);
    }

    /* Populate shard_id */
    if (cursor != NULL) {
        clusterMsgPingExtShardId *ext = preparePingExt(cursor, CLUSTERMSG_EXT_TYPE_SHARDID, getShardIdPingExtSize());
        memcpy(ext->shard_id, myself->shard_id, CLUSTER_NAMELEN);

        /* Move the write cursor */
        cursor = nextPingExt(cursor);
    }
    totlen += getShardIdPingExtSize();
    extensions++;

    if (hdr != NULL) {
        if (extensions != 0) {
            hdr->mflags[0] |= CLUSTERMSG_FLAG0_EXT_DATA;
        }
        hdr->extensions = htons(extensions);
    }

    return totlen;
}

/* We previously validated the extensions, so this function just needs to
 * handle the extensions. */
void clusterProcessPingExtensions(clusterMsg *hdr, clusterLink *link) {
    clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender, CLUSTER_NAMELEN);
    char *ext_hostname = NULL;
    char *ext_humannodename = NULL;
    char *ext_shardid = NULL;
    uint16_t extensions = ntohs(hdr->extensions);
    /* Loop through all the extensions and process them */
    clusterMsgPingExt *ext = getInitialPingExt(hdr, ntohs(hdr->count));
    while (extensions--) {
        uint16_t type = ntohs(ext->type);
        if (type == CLUSTERMSG_EXT_TYPE_HOSTNAME) {
            clusterMsgPingExtHostname *hostname_ext = (clusterMsgPingExtHostname *) &(ext->ext[0].hostname);
            ext_hostname = hostname_ext->hostname;
        } else if (type == CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME) {
            clusterMsgPingExtHumanNodename *humannodename_ext = (clusterMsgPingExtHumanNodename *) &(ext->ext[0].human_nodename);
            ext_humannodename = humannodename_ext->human_nodename;
        } else if (type == CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE) {
            clusterMsgPingExtForgottenNode *forgotten_node_ext = &(ext->ext[0].forgotten_node);
            clusterNode *n = clusterLookupNode(forgotten_node_ext->name, CLUSTER_NAMELEN);
            /* Never forget ourselves or our own master on behalf of a peer. */
            if (n && n != myself && !(nodeIsSlave(myself) && myself->slaveof == n)) {
                sds id = sdsnewlen(forgotten_node_ext->name, CLUSTER_NAMELEN);
                dictEntry *de = dictAddOrFind(server.cluster->nodes_black_list, id);
                uint64_t expire = server.unixtime + ntohu64(forgotten_node_ext->ttl);
dictSetUnsignedIntegerVal(de, expire);
                clusterDelNode(n);
                clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|
                                     CLUSTER_TODO_SAVE_CONFIG);
            }
        } else if (type == CLUSTERMSG_EXT_TYPE_SHARDID) {
            clusterMsgPingExtShardId *shardid_ext = (clusterMsgPingExtShardId *) &(ext->ext[0].shard_id);
            ext_shardid = shardid_ext->shard_id;
        } else {
            /* Unknown type, we will ignore it but log what happened. */
            serverLog(LL_WARNING, "Received unknown extension type %d", type);
        }

        /* We know this will be valid since we validated it ahead of time */
        ext = getNextPingExt(ext);
    }

    /* If the node did not send us a hostname extension, assume
     * they don't have an announced hostname. Otherwise, we'll
     * set it now. */
    updateAnnouncedHostname(sender, ext_hostname);
    updateAnnouncedHumanNodename(sender, ext_humannodename);
    /* If the node did not send us a shard-id extension, it means the sender
     * does not support it (old version), node->shard_id is randomly generated.
     * A cluster-wide consensus for the node's shard_id is not necessary.
     * The key is maintaining consistency of the shard_id on each individual 7.2 node.
     * As the cluster progressively upgrades to version 7.2, we can expect the shard_ids
     * across all nodes to naturally converge and align.
     *
     * If sender is a replica, set the shard_id to the shard_id of its master.
     * Otherwise, we'll set it now. */
    if (ext_shardid == NULL) ext_shardid = clusterNodeGetMaster(sender)->shard_id;

    updateShardId(sender, ext_shardid);
}

/* Resolve the sending node for a message received on 'link': prefer the
 * node already associated with the link, falling back to a lookup by the
 * sender ID in the header. May return NULL for unknown senders. */
static clusterNode *getNodeFromLinkAndMsg(clusterLink *link, clusterMsg *hdr) {
    clusterNode *sender;
    if (link->node && !nodeInHandshake(link->node)) {
        /* If the link has an associated node, use that so that we don't have to look it
         * up every time, except when the node is still in handshake, the node still has
         * a random name thus not truly "known". */
        sender = link->node;
    } else {
        /* Otherwise, fetch sender based on the message */
        sender = clusterLookupNode(hdr->sender, CLUSTER_NAMELEN);
        /* We know the sender node but haven't associate it with the link. This must
         * be an inbound link because only for inbound links we didn't know which node
         * to associate when they were created. */
        if (sender && !link->node) {
            setClusterNodeToInboundClusterLink(sender, link);
        }
    }
    return sender;
}

/* When this function is called, there is a packet to process starting
 * at link->rcvbuf. Releasing the buffer is up to the caller, so this
 * function should just handle the higher level stuff of processing the
 * packet, modifying the cluster state if needed.
 *
 * The function returns 1 if the link is still valid after the packet
 * was processed, otherwise 0 if the link was freed since the packet
 * processing lead to some inconsistency error (for instance a PONG
 * received from the wrong sender ID). */
int clusterProcessPacket(clusterLink *link) {
    clusterMsg *hdr = (clusterMsg*) link->rcvbuf;
    uint32_t totlen = ntohl(hdr->totlen);
    uint16_t type = ntohs(hdr->type);
    mstime_t now = mstime();

    if (type < CLUSTERMSG_TYPE_COUNT)
        server.cluster->stats_bus_messages_received[type]++;
    serverLog(LL_DEBUG,"--- Processing packet of type %s, %lu bytes",
        clusterGetMessageTypeString(type), (unsigned long) totlen);

    /* Perform sanity checks */
    if (totlen < 16) return 1; /* At least signature, version, totlen, count. */
    if (totlen > link->rcvbuf_len) return 1;

    if (ntohs(hdr->ver) != CLUSTER_PROTO_VER) {
        /* Can't handle messages of different versions.
*/ + return 1; + } + + if (type == server.cluster_drop_packet_filter) { + serverLog(LL_WARNING, "Dropping packet that matches debug drop filter"); + return 1; + } + + uint16_t flags = ntohs(hdr->flags); + uint16_t extensions = ntohs(hdr->extensions); + uint64_t senderCurrentEpoch = 0, senderConfigEpoch = 0; + uint32_t explen; /* expected length of this packet */ + clusterNode *sender; + + if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || + type == CLUSTERMSG_TYPE_MEET) + { + uint16_t count = ntohs(hdr->count); + + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + explen += (sizeof(clusterMsgDataGossip)*count); + + /* If there is extension data, which doesn't have a fixed length, + * loop through them and validate the length of it now. */ + if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { + clusterMsgPingExt *ext = getInitialPingExt(hdr, count); + while (extensions--) { + uint16_t extlen = getPingExtLength(ext); + if (extlen % 8 != 0) { + serverLog(LL_WARNING, "Received a %s packet without proper padding (%d bytes)", + clusterGetMessageTypeString(type), (int) extlen); + return 1; + } + if ((totlen - explen) < extlen) { + serverLog(LL_WARNING, "Received invalid %s packet with extension data that exceeds " + "total packet length (%lld)", clusterGetMessageTypeString(type), + (unsigned long long) totlen); + return 1; + } + explen += extlen; + ext = getNextPingExt(ext); + } + } + } else if (type == CLUSTERMSG_TYPE_FAIL) { + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + explen += sizeof(clusterMsgDataFail); + } else if (type == CLUSTERMSG_TYPE_PUBLISH || type == CLUSTERMSG_TYPE_PUBLISHSHARD) { + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + explen += sizeof(clusterMsgDataPublish) - + 8 + + ntohl(hdr->data.publish.msg.channel_len) + + ntohl(hdr->data.publish.msg.message_len); + } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST || + type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK || + type == CLUSTERMSG_TYPE_MFSTART) 
+ { + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + } else if (type == CLUSTERMSG_TYPE_UPDATE) { + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + explen += sizeof(clusterMsgDataUpdate); + } else if (type == CLUSTERMSG_TYPE_MODULE) { + explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + explen += sizeof(clusterMsgModule) - + 3 + ntohl(hdr->data.module.msg.len); + } else { + /* We don't know this type of packet, so we assume it's well formed. */ + explen = totlen; + } + + if (totlen != explen) { + serverLog(LL_WARNING, "Received invalid %s packet of length %lld but expected length %lld", + clusterGetMessageTypeString(type), (unsigned long long) totlen, (unsigned long long) explen); + return 1; + } + + sender = getNodeFromLinkAndMsg(link, hdr); + + /* Update the last time we saw any data from this node. We + * use this in order to avoid detecting a timeout from a node that + * is just sending a lot of data in the cluster bus, for instance + * because of Pub/Sub. */ + if (sender) sender->data_received = now; + + if (sender && !nodeInHandshake(sender)) { + /* Update our currentEpoch if we see a newer epoch in the cluster. */ + senderCurrentEpoch = ntohu64(hdr->currentEpoch); + senderConfigEpoch = ntohu64(hdr->configEpoch); + if (senderCurrentEpoch > server.cluster->currentEpoch) + server.cluster->currentEpoch = senderCurrentEpoch; + /* Update the sender configEpoch if it is publishing a newer one. */ + if (senderConfigEpoch > sender->configEpoch) { + sender->configEpoch = senderConfigEpoch; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_FSYNC_CONFIG); + } + /* Update the replication offset info for this node. */ + sender->repl_offset = ntohu64(hdr->offset); + sender->repl_offset_time = now; + /* If we are a slave performing a manual failover and our master + * sent its offset while already paused, populate the MF state. 
*/ + if (server.cluster->mf_end && + nodeIsSlave(myself) && + myself->slaveof == sender && + hdr->mflags[0] & CLUSTERMSG_FLAG0_PAUSED && + server.cluster->mf_master_offset == -1) + { + server.cluster->mf_master_offset = sender->repl_offset; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); + serverLog(LL_NOTICE, + "Received replication offset for paused " + "master manual failover: %lld", + server.cluster->mf_master_offset); + } + } + + /* Initial processing of PING and MEET requests replying with a PONG. */ + if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_MEET) { + /* We use incoming MEET messages in order to set the address + * for 'myself', since only other cluster nodes will send us + * MEET messages on handshakes, when the cluster joins, or + * later if we changed address, and those nodes will use our + * official address to connect to us. So by obtaining this address + * from the socket is a simple way to discover / update our own + * address in the cluster without it being hardcoded in the config. + * + * However if we don't have an address at all, we update the address + * even with a normal PING packet. If it's wrong it will be fixed + * by MEET later. */ + if ((type == CLUSTERMSG_TYPE_MEET || myself->ip[0] == '\0') && + server.cluster_announce_ip == NULL) + { + char ip[NET_IP_STR_LEN]; + + if (connAddrSockName(link->conn,ip,sizeof(ip),NULL) != -1 && + strcmp(ip,myself->ip)) + { + memcpy(myself->ip,ip,NET_IP_STR_LEN); + serverLog(LL_NOTICE,"IP address for this node updated to %s", + myself->ip); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } + } + + /* Add this node if it is new for us and the msg type is MEET. + * In this stage we don't try to add the node with the right + * flags, slaveof pointer, and so forth, as this details will be + * resolved when we'll receive PONGs from the node. 
*/ + if (!sender && type == CLUSTERMSG_TYPE_MEET) { + clusterNode *node; + + node = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE); + serverAssert(nodeIp2String(node->ip,link,hdr->myip) == C_OK); + getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); + node->cport = ntohs(hdr->cport); + clusterAddNode(node); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } + + /* If this is a MEET packet from an unknown node, we still process + * the gossip section here since we have to trust the sender because + * of the message type. */ + if (!sender && type == CLUSTERMSG_TYPE_MEET) + clusterProcessGossipSection(hdr,link); + + /* Anyway reply with a PONG */ + clusterSendPing(link,CLUSTERMSG_TYPE_PONG); + } + + /* PING, PONG, MEET: process config information. */ + if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || + type == CLUSTERMSG_TYPE_MEET) + { + serverLog(LL_DEBUG,"%s packet received: %.40s", + clusterGetMessageTypeString(type), + link->node ? link->node->name : "NULL"); + if (!link->inbound) { + if (nodeInHandshake(link->node)) { + /* If we already have this node, try to change the + * IP/port of the node with the new one. */ + if (sender) { + serverLog(LL_VERBOSE, + "Handshake: we already know node %.40s (%s), " + "updating the address if needed.", sender->name, sender->human_nodename); + if (nodeUpdateAddressIfNeeded(sender,link,hdr)) + { + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } + /* Free this node as we already have it. This will + * cause the link to be freed as well. */ + clusterDelNode(link->node); + return 0; + } + + /* First thing to do is replacing the random name with the + * right node name if this was a handshake stage. 
*/ + clusterRenameNode(link->node, hdr->sender); + serverLog(LL_DEBUG,"Handshake with node %.40s completed.", + link->node->name); + link->node->flags &= ~CLUSTER_NODE_HANDSHAKE; + link->node->flags |= flags&(CLUSTER_NODE_MASTER|CLUSTER_NODE_SLAVE); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } else if (memcmp(link->node->name,hdr->sender, + CLUSTER_NAMELEN) != 0) + { + /* If the reply has a non matching node ID we + * disconnect this node and set it as not having an associated + * address. */ + serverLog(LL_DEBUG,"PONG contains mismatching sender ID. About node %.40s added %d ms ago, having flags %d", + link->node->name, + (int)(now-(link->node->ctime)), + link->node->flags); + link->node->flags |= CLUSTER_NODE_NOADDR; + link->node->ip[0] = '\0'; + link->node->tcp_port = 0; + link->node->tls_port = 0; + link->node->cport = 0; + freeClusterLink(link); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + return 0; + } + } + + /* Copy the CLUSTER_NODE_NOFAILOVER flag from what the sender + * announced. This is a dynamic flag that we receive from the + * sender, and the latest status must be trusted. We need it to + * be propagated because the slave ranking used to understand the + * delay of each slave in the voting process, needs to know + * what are the instances really competing. */ + if (sender) { + int nofailover = flags & CLUSTER_NODE_NOFAILOVER; + sender->flags &= ~CLUSTER_NODE_NOFAILOVER; + sender->flags |= nofailover; + } + + /* Update the node address if it changed. 
*/ + if (sender && type == CLUSTERMSG_TYPE_PING && + !nodeInHandshake(sender) && + nodeUpdateAddressIfNeeded(sender,link,hdr)) + { + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } + + /* Update our info about the node */ + if (!link->inbound && type == CLUSTERMSG_TYPE_PONG) { + link->node->pong_received = now; + link->node->ping_sent = 0; + + /* The PFAIL condition can be reversed without external + * help if it is momentary (that is, if it does not + * turn into a FAIL state). + * + * The FAIL condition is also reversible under specific + * conditions detected by clearNodeFailureIfNeeded(). */ + if (nodeTimedOut(link->node)) { + link->node->flags &= ~CLUSTER_NODE_PFAIL; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } else if (nodeFailed(link->node)) { + clearNodeFailureIfNeeded(link->node); + } + } + + /* Check for role switch: slave -> master or master -> slave. */ + if (sender) { + if (!memcmp(hdr->slaveof,CLUSTER_NODE_NULL_NAME, + sizeof(hdr->slaveof))) + { + /* Node is a master. */ + clusterSetNodeAsMaster(sender); + } else { + /* Node is a slave. */ + clusterNode *master = clusterLookupNode(hdr->slaveof, CLUSTER_NAMELEN); + + if (clusterNodeIsMaster(sender)) { + /* Master turned into a slave! Reconfigure the node. */ + if (master && !memcmp(master->shard_id, sender->shard_id, CLUSTER_NAMELEN)) { + /* `sender` was a primary and was in the same shard as `master`, its new primary */ + if (sender->configEpoch > senderConfigEpoch) { + serverLog(LL_NOTICE, + "Ignore stale message from %.40s (%s) in shard %.40s;" + " gossip config epoch: %llu, current config epoch: %llu", + sender->name, + sender->human_nodename, + sender->shard_id, + (unsigned long long)senderConfigEpoch, + (unsigned long long)sender->configEpoch); + } else { + /* A failover occurred in the shard where `sender` belongs to and `sender` is no longer + * a primary. 
Update slot assignment to `master`, which is the new primary in the shard */ + int slots = clusterMoveNodeSlots(sender, master); + /* `master` is still a `slave` in this observer node's view; update its role and configEpoch */ + clusterSetNodeAsMaster(master); + master->configEpoch = senderConfigEpoch; + serverLog(LL_NOTICE, "A failover occurred in shard %.40s; node %.40s (%s)" + " lost %d slot(s) to node %.40s (%s) with a config epoch of %llu", + sender->shard_id, + sender->name, + sender->human_nodename, + slots, + master->name, + master->human_nodename, + (unsigned long long) master->configEpoch); + } + } else { + /* `sender` was moved to another shard and has become a replica, remove its slot assignment */ + int slots = clusterDelNodeSlots(sender); + serverLog(LL_NOTICE, "Node %.40s (%s) is no longer master of shard %.40s;" + " removed all %d slot(s) it used to own", + sender->name, + sender->human_nodename, + sender->shard_id, + slots); + if (master != NULL) { + serverLog(LL_NOTICE, "Node %.40s (%s) is now part of shard %.40s", + sender->name, + sender->human_nodename, + master->shard_id); + } + } + sender->flags &= ~(CLUSTER_NODE_MASTER| + CLUSTER_NODE_MIGRATE_TO); + sender->flags |= CLUSTER_NODE_SLAVE; + + /* Update config and state. */ + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } + + /* Master node changed for this slave? */ + if (master && sender->slaveof != master) { + if (sender->slaveof) + clusterNodeRemoveSlave(sender->slaveof,sender); + clusterNodeAddSlave(master,sender); + sender->slaveof = master; + + /* Update the shard_id when a replica is connected to its + * primary in the very first time. */ + updateShardId(sender, master->shard_id); + + /* Update config. */ + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } + } + } + + /* Update our info about served slots. + * + * Note: this MUST happen after we update the master/slave state + * so that CLUSTER_NODE_MASTER flag will be set. 
*/ + + /* Many checks are only needed if the set of served slots this + * instance claims is different compared to the set of slots we have + * for it. Check this ASAP to avoid other computationally expensive + * checks later. */ + clusterNode *sender_master = NULL; /* Sender or its master if slave. */ + int dirty_slots = 0; /* Sender claimed slots don't match my view? */ + + if (sender) { + sender_master = clusterNodeIsMaster(sender) ? sender : sender->slaveof; + if (sender_master) { + dirty_slots = memcmp(sender_master->slots, + hdr->myslots,sizeof(hdr->myslots)) != 0; + } + } + + /* 1) If the sender of the message is a master, and we detected that + * the set of slots it claims changed, scan the slots to see if we + * need to update our configuration. */ + if (sender && clusterNodeIsMaster(sender) && dirty_slots) + clusterUpdateSlotsConfigWith(sender,senderConfigEpoch,hdr->myslots); + + /* 2) We also check for the reverse condition, that is, the sender + * claims to serve slots we know are served by a master with a + * greater configEpoch. If this happens we inform the sender. + * + * This is useful because sometimes after a partition heals, a + * reappearing master may be the last one to claim a given set of + * hash slots, but with a configuration that other instances know to + * be deprecated. Example: + * + * A and B are master and slave for slots 1,2,3. + * A is partitioned away, B gets promoted. + * B is partitioned away, and A returns available. + * + * Usually B would PING A publishing its set of served slots and its + * configEpoch, but because of the partition B can't inform A of the + * new configuration, so other nodes that have an updated table must + * do it. In this way A will stop to act as a master (or can try to + * failover if there are the conditions to win the election).
*/ + if (sender && dirty_slots) { + int j; + + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (bitmapTestBit(hdr->myslots,j)) { + if (server.cluster->slots[j] == sender || + isSlotUnclaimed(j)) continue; + if (server.cluster->slots[j]->configEpoch > + senderConfigEpoch) + { + serverLog(LL_VERBOSE, + "Node %.40s has old slots configuration, sending " + "an UPDATE message about %.40s", + sender->name, server.cluster->slots[j]->name); + clusterSendUpdate(sender->link, + server.cluster->slots[j]); + + /* TODO: instead of exiting the loop send every other + * UPDATE packet for other nodes that are the new owner + * of sender's slots. */ + break; + } + } + } + } + + /* If our config epoch collides with the sender's try to fix + * the problem. */ + if (sender && clusterNodeIsMaster(myself) && clusterNodeIsMaster(sender) && + senderConfigEpoch == myself->configEpoch) + { + clusterHandleConfigEpochCollision(sender); + } + + /* Get info from the gossip section */ + if (sender) { + clusterProcessGossipSection(hdr,link); + clusterProcessPingExtensions(hdr,link); + } + } else if (type == CLUSTERMSG_TYPE_FAIL) { + clusterNode *failing; + + if (sender) { + failing = clusterLookupNode(hdr->data.fail.about.nodename, CLUSTER_NAMELEN); + if (failing && + !(failing->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_MYSELF))) + { + serverLog(LL_NOTICE, + "FAIL message received from %.40s (%s) about %.40s (%s)", + hdr->sender, sender->human_nodename, hdr->data.fail.about.nodename, failing->human_nodename); + failing->flags |= CLUSTER_NODE_FAIL; + failing->fail_time = now; + failing->flags &= ~CLUSTER_NODE_PFAIL; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); + } + } else { + serverLog(LL_NOTICE, + "Ignoring FAIL message from unknown node %.40s about %.40s", + hdr->sender, hdr->data.fail.about.nodename); + } + } else if (type == CLUSTERMSG_TYPE_PUBLISH || type == CLUSTERMSG_TYPE_PUBLISHSHARD) { + if (!sender) return 1; /* We don't know that node. 
*/ + + robj *channel, *message; + uint32_t channel_len, message_len; + + /* Don't bother creating useless objects if there are no + * Pub/Sub subscribers. */ + if ((type == CLUSTERMSG_TYPE_PUBLISH + && serverPubsubSubscriptionCount() > 0) + || (type == CLUSTERMSG_TYPE_PUBLISHSHARD + && serverPubsubShardSubscriptionCount() > 0)) + { + channel_len = ntohl(hdr->data.publish.msg.channel_len); + message_len = ntohl(hdr->data.publish.msg.message_len); + channel = createStringObject( + (char*)hdr->data.publish.msg.bulk_data,channel_len); + message = createStringObject( + (char*)hdr->data.publish.msg.bulk_data+channel_len, + message_len); + pubsubPublishMessage(channel, message, type == CLUSTERMSG_TYPE_PUBLISHSHARD); + decrRefCount(channel); + decrRefCount(message); + } + } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST) { + if (!sender) return 1; /* We don't know that node. */ + clusterSendFailoverAuthIfNeeded(sender,hdr); + } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) { + if (!sender) return 1; /* We don't know that node. */ + /* We consider this vote only if the sender is a master serving + * a non zero number of slots, and its currentEpoch is greater or + * equal to epoch where this node started the election. */ + if (clusterNodeIsMaster(sender) && sender->numslots > 0 && + senderCurrentEpoch >= server.cluster->failover_auth_epoch) + { + server.cluster->failover_auth_count++; + /* Maybe we reached a quorum here, set a flag to make sure + * we check ASAP. */ + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); + } + } else if (type == CLUSTERMSG_TYPE_MFSTART) { + /* This message is acceptable only if I'm a master and the sender + * is one of my slaves. */ + if (!sender || sender->slaveof != myself) return 1; + /* Manual failover requested from slaves. Initialize the state + * accordingly. 
*/ + resetManualFailover(); + server.cluster->mf_end = now + CLUSTER_MF_TIMEOUT; + server.cluster->mf_slave = sender; + pauseActions(PAUSE_DURING_FAILOVER, + now + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT), + PAUSE_ACTIONS_CLIENT_WRITE_SET); + serverLog(LL_NOTICE,"Manual failover requested by replica %.40s (%s).", + sender->name, sender->human_nodename); + /* We need to send a ping message to the replica, as it would carry + * `server.cluster->mf_master_offset`, which means the master paused clients + * at offset `server.cluster->mf_master_offset`, so that the replica would + * know that it is safe to set its `server.cluster->mf_can_start` to 1 so as + * to complete failover as quickly as possible. */ + clusterSendPing(link, CLUSTERMSG_TYPE_PING); + } else if (type == CLUSTERMSG_TYPE_UPDATE) { + clusterNode *n; /* The node the update is about. */ + uint64_t reportedConfigEpoch = + ntohu64(hdr->data.update.nodecfg.configEpoch); + + if (!sender) return 1; /* We don't know the sender. */ + n = clusterLookupNode(hdr->data.update.nodecfg.nodename, CLUSTER_NAMELEN); + if (!n) return 1; /* We don't know the reported node. */ + if (n->configEpoch >= reportedConfigEpoch) return 1; /* Nothing new. */ + + /* If in our current config the node is a slave, set it as a master. */ + if (nodeIsSlave(n)) clusterSetNodeAsMaster(n); + + /* Update the node's configEpoch. */ + n->configEpoch = reportedConfigEpoch; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_FSYNC_CONFIG); + + /* Check the bitmap of served slots and update our + * config accordingly. */ + clusterUpdateSlotsConfigWith(n,reportedConfigEpoch, + hdr->data.update.nodecfg.slots); + } else if (type == CLUSTERMSG_TYPE_MODULE) { + if (!sender) return 1; /* Protect the module from unknown nodes. */ + /* We need to route this message back to the right module subscribed + * for the right message type. 
*/ + uint64_t module_id = hdr->data.module.msg.module_id; /* Endian-safe ID */ + uint32_t len = ntohl(hdr->data.module.msg.len); + uint8_t type = hdr->data.module.msg.type; + unsigned char *payload = hdr->data.module.msg.bulk_data; + moduleCallClusterReceivers(sender->name,module_id,type,payload,len); + } else { + serverLog(LL_WARNING,"Received unknown packet type: %d", type); + } + return 1; +} + +/* This function is called when we detect the link with this node is lost. + We set the node as no longer connected. The Cluster Cron will detect + this connection and will try to get it connected again. + + Instead if the node is a temporary node used to accept a query, we + completely free the node on error. */ +void handleLinkIOError(clusterLink *link) { + freeClusterLink(link); +} + +/* Send the messages queued for the link. */ +void clusterWriteHandler(connection *conn) { + clusterLink *link = connGetPrivateData(conn); + ssize_t nwritten; + size_t totwritten = 0; + + while (totwritten < NET_MAX_WRITES_PER_EVENT && listLength(link->send_msg_queue) > 0) { + listNode *head = listFirst(link->send_msg_queue); + clusterMsgSendBlock *msgblock = (clusterMsgSendBlock*)head->value; + clusterMsg *msg = &msgblock->msg; + size_t msg_offset = link->head_msg_send_offset; + size_t msg_len = ntohl(msg->totlen); + + nwritten = connWrite(conn, (char*)msg + msg_offset, msg_len - msg_offset); + if (nwritten <= 0) { + serverLog(LL_DEBUG,"I/O error writing to node link: %s", + (nwritten == -1) ? 
connGetLastError(conn) : "short write"); + handleLinkIOError(link); + return; + } + if (msg_offset + nwritten < msg_len) { + /* If full message wasn't written, record the offset + * and continue sending from this point next time */ + link->head_msg_send_offset += nwritten; + return; + } + serverAssert((msg_offset + nwritten) == msg_len); + link->head_msg_send_offset = 0; + + /* Delete the node and update our memory tracking */ + uint32_t blocklen = msgblock->totlen; + listDelNode(link->send_msg_queue, head); + server.stat_cluster_links_memory -= sizeof(listNode); + link->send_msg_queue_mem -= sizeof(listNode) + blocklen; + + totwritten += nwritten; + } + + if (listLength(link->send_msg_queue) == 0) + connSetWriteHandler(link->conn, NULL); +} + +/* A connect handler that gets called when a connection to another node + * gets established. + */ +void clusterLinkConnectHandler(connection *conn) { + clusterLink *link = connGetPrivateData(conn); + clusterNode *node = link->node; + + /* Check if connection succeeded */ + if (connGetState(conn) != CONN_STATE_CONNECTED) { + serverLog(LL_VERBOSE, "Connection with Node %.40s at %s:%d failed: %s", + node->name, node->ip, node->cport, + connGetLastError(conn)); + freeClusterLink(link); + return; + } + + /* Register a read handler from now on */ + connSetReadHandler(conn, clusterReadHandler); + + /* Queue a PING in the new connection ASAP: this is crucial + * to avoid false positives in failure detection. + * + * If the node is flagged as MEET, we send a MEET message instead + * of a PING one, to force the receiver to add us in its node + * table. */ + mstime_t old_ping_sent = node->ping_sent; + clusterSendPing(link, node->flags & CLUSTER_NODE_MEET ? + CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); + if (old_ping_sent) { + /* If there was an active ping before the link was + * disconnected, we want to restore the ping time, otherwise + * replaced by the clusterSendPing() call. 
*/ + node->ping_sent = old_ping_sent; + } + /* We can clear the flag after the first packet is sent. + * If we'll never receive a PONG, we'll never send new packets + * to this node. Instead after the PONG is received and we + * are no longer in meet/handshake status, we want to send + * normal PING packets. */ + node->flags &= ~CLUSTER_NODE_MEET; + + serverLog(LL_DEBUG,"Connecting with Node %.40s at %s:%d", + node->name, node->ip, node->cport); +} + +/* Read data. Try to read the first field of the header first to check the + * full length of the packet. When a whole packet is in memory this function + * will call the function to process the packet. And so forth. */ +void clusterReadHandler(connection *conn) { + clusterMsg buf[1]; + ssize_t nread; + clusterMsg *hdr; + clusterLink *link = connGetPrivateData(conn); + unsigned int readlen, rcvbuflen; + + while(1) { /* Read as long as there is data to read. */ + rcvbuflen = link->rcvbuf_len; + if (rcvbuflen < 8) { + /* First, obtain the first 8 bytes to get the full message + * length. */ + readlen = 8 - rcvbuflen; + } else { + /* Finally read the full message. */ + hdr = (clusterMsg*) link->rcvbuf; + if (rcvbuflen == 8) { + /* Perform some sanity check on the message signature + * and length. */ + if (memcmp(hdr->sig,"RCmb",4) != 0 || + ntohl(hdr->totlen) < CLUSTERMSG_MIN_LEN) + { + char ip[NET_IP_STR_LEN]; + int port; + if (connAddrPeerName(conn, ip, sizeof(ip), &port) == -1) { + serverLog(LL_WARNING, + "Bad message length or signature received " + "on the Cluster bus."); + } else { + serverLog(LL_WARNING, + "Bad message length or signature received " + "on the Cluster bus from %s:%d", ip, port); + } + handleLinkIOError(link); + return; + } + } + readlen = ntohl(hdr->totlen) - rcvbuflen; + if (readlen > sizeof(buf)) readlen = sizeof(buf); + } + + nread = connRead(conn,buf,readlen); + if (nread == -1 && (connGetState(conn) == CONN_STATE_CONNECTED)) return; /* No more data ready. 
*/ + + if (nread <= 0) { + /* I/O error... */ + serverLog(LL_DEBUG,"I/O error reading from node link: %s", + (nread == 0) ? "connection closed" : connGetLastError(conn)); + handleLinkIOError(link); + return; + } else { + /* Read data and recast the pointer to the new buffer. */ + size_t unused = link->rcvbuf_alloc - link->rcvbuf_len; + if ((size_t)nread > unused) { + size_t required = link->rcvbuf_len + nread; + size_t prev_rcvbuf_alloc = link->rcvbuf_alloc; + /* If less than 1mb, grow to twice the needed size, if larger grow by 1mb. */ + link->rcvbuf_alloc = required < RCVBUF_MAX_PREALLOC ? required * 2: required + RCVBUF_MAX_PREALLOC; + link->rcvbuf = zrealloc(link->rcvbuf, link->rcvbuf_alloc); + server.stat_cluster_links_memory += link->rcvbuf_alloc - prev_rcvbuf_alloc; + } + memcpy(link->rcvbuf + link->rcvbuf_len, buf, nread); + link->rcvbuf_len += nread; + hdr = (clusterMsg*) link->rcvbuf; + rcvbuflen += nread; + } + + /* Total length obtained? Process this packet. */ + if (rcvbuflen >= 8 && rcvbuflen == ntohl(hdr->totlen)) { + if (clusterProcessPacket(link)) { + if (link->rcvbuf_alloc > RCVBUF_INIT_LEN) { + size_t prev_rcvbuf_alloc = link->rcvbuf_alloc; + zfree(link->rcvbuf); + link->rcvbuf = zmalloc(link->rcvbuf_alloc = RCVBUF_INIT_LEN); + server.stat_cluster_links_memory += link->rcvbuf_alloc - prev_rcvbuf_alloc; + } + link->rcvbuf_len = 0; + } else { + return; /* Link no longer valid. */ + } + } + } +} + +/* Put the message block into the link's send queue. + * + * It is guaranteed that this function will never have as a side effect + * the link to be invalidated, so it is safe to call this function + * from event handlers that will do stuff with the same link later. 
*/ +void clusterSendMessage(clusterLink *link, clusterMsgSendBlock *msgblock) { + if (!link) { + return; + } + if (listLength(link->send_msg_queue) == 0 && msgblock->msg.totlen != 0) + connSetWriteHandlerWithBarrier(link->conn, clusterWriteHandler, 1); + + listAddNodeTail(link->send_msg_queue, msgblock); + msgblock->refcount++; + + /* Update memory tracking */ + link->send_msg_queue_mem += sizeof(listNode) + msgblock->totlen; + server.stat_cluster_links_memory += sizeof(listNode); + + /* Populate sent messages stats. */ + uint16_t type = ntohs(msgblock->msg.type); + if (type < CLUSTERMSG_TYPE_COUNT) + server.cluster->stats_bus_messages_sent[type]++; +} + +/* Send a message to all the nodes that are part of the cluster having + * a connected link. + * + * It is guaranteed that this function will never have as a side effect + * some node->link to be invalidated, so it is safe to call this function + * from event handlers that will do stuff with node links later. */ +void clusterBroadcastMessage(clusterMsgSendBlock *msgblock) { + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) + continue; + clusterSendMessage(node->link,msgblock); + } + dictReleaseIterator(di); +} + +/* Build the message header. hdr must point to a buffer at least + * sizeof(clusterMsg) in bytes. */ +static void clusterBuildMessageHdr(clusterMsg *hdr, int type, size_t msglen) { + uint64_t offset; + clusterNode *master; + + /* If this node is a master, we send its slots bitmap and configEpoch. + * If this node is a slave we send the master's information instead (the + * node is flagged as slave so the receiver knows that it is NOT really + * in charge for this slots. */ + master = (nodeIsSlave(myself) && myself->slaveof) ? 
+ myself->slaveof : myself; + + hdr->ver = htons(CLUSTER_PROTO_VER); + hdr->sig[0] = 'R'; + hdr->sig[1] = 'C'; + hdr->sig[2] = 'm'; + hdr->sig[3] = 'b'; + hdr->type = htons(type); + memcpy(hdr->sender,myself->name,CLUSTER_NAMELEN); + + /* If cluster-announce-ip option is enabled, force the receivers of our + * packets to use the specified address for this node. Otherwise if the + * first byte is zero, they'll do auto discovery. */ + memset(hdr->myip,0,NET_IP_STR_LEN); + if (server.cluster_announce_ip) { + redis_strlcpy(hdr->myip,server.cluster_announce_ip,NET_IP_STR_LEN); + } + + /* Handle cluster-announce-[tls-|bus-]port. */ + int announced_tcp_port, announced_tls_port, announced_cport; + deriveAnnouncedPorts(&announced_tcp_port, &announced_tls_port, &announced_cport); + + memcpy(hdr->myslots,master->slots,sizeof(hdr->myslots)); + memset(hdr->slaveof,0,CLUSTER_NAMELEN); + if (myself->slaveof != NULL) + memcpy(hdr->slaveof,myself->slaveof->name, CLUSTER_NAMELEN); + if (server.tls_cluster) { + hdr->port = htons(announced_tls_port); + hdr->pport = htons(announced_tcp_port); + } else { + hdr->port = htons(announced_tcp_port); + hdr->pport = htons(announced_tls_port); + } + hdr->cport = htons(announced_cport); + hdr->flags = htons(myself->flags); + hdr->state = server.cluster->state; + + /* Set the currentEpoch and configEpochs. */ + hdr->currentEpoch = htonu64(server.cluster->currentEpoch); + hdr->configEpoch = htonu64(master->configEpoch); + + /* Set the replication offset. */ + if (nodeIsSlave(myself)) + offset = replicationGetSlaveOffset(); + else + offset = server.master_repl_offset; + hdr->offset = htonu64(offset); + + /* Set the message flags. */ + if (clusterNodeIsMaster(myself) && server.cluster->mf_end) + hdr->mflags[0] |= CLUSTERMSG_FLAG0_PAUSED; + + hdr->totlen = htonl(msglen); +} + +/* Set the i-th entry of the gossip section in the message pointed by 'hdr' + * to the info of the specified node 'n'. 
*/ +void clusterSetGossipEntry(clusterMsg *hdr, int i, clusterNode *n) { + clusterMsgDataGossip *gossip; + gossip = &(hdr->data.ping.gossip[i]); + memcpy(gossip->nodename,n->name,CLUSTER_NAMELEN); + gossip->ping_sent = htonl(n->ping_sent/1000); + gossip->pong_received = htonl(n->pong_received/1000); + memcpy(gossip->ip,n->ip,sizeof(n->ip)); + if (server.tls_cluster) { + gossip->port = htons(n->tls_port); + gossip->pport = htons(n->tcp_port); + } else { + gossip->port = htons(n->tcp_port); + gossip->pport = htons(n->tls_port); + } + gossip->cport = htons(n->cport); + gossip->flags = htons(n->flags); + gossip->notused1 = 0; +} + +/* Send a PING or PONG packet to the specified node, making sure to add enough + * gossip information. */ +void clusterSendPing(clusterLink *link, int type) { + static unsigned long long cluster_pings_sent = 0; + cluster_pings_sent++; + int gossipcount = 0; /* Number of gossip sections added so far. */ + int wanted; /* Number of gossip sections we want to append if possible. */ + int estlen; /* Upper bound on estimated packet length */ + /* freshnodes is the max number of nodes we can hope to append at all: + * nodes available minus two (ourself and the node we are sending the + * message to). However practically there may be less valid nodes since + * nodes in handshake state, disconnected, are not considered. */ + int freshnodes = dictSize(server.cluster->nodes)-2; + + /* How many gossip sections we want to add? 1/10 of the number of nodes + * and anyway at least 3. Why 1/10? + * + * If we have N masters, with N/10 entries, and we consider that in + * node_timeout we exchange with each other node at least 4 packets + * (we ping in the worst case in node_timeout/2 time, and we also + * receive two pings from the host), we have a total of 8 packets + * in the node_timeout*2 failure reports validity time. 
So we have + * that, for a single PFAIL node, we can expect to receive the following + * number of failure reports (in the specified window of time): + * + * PROB * GOSSIP_ENTRIES_PER_PACKET * TOTAL_PACKETS: + * + * PROB = probability of being featured in a single gossip entry, + * which is 1 / NUM_OF_NODES. + * ENTRIES = 10. + * TOTAL_PACKETS = 2 * 4 * NUM_OF_MASTERS. + * + * If we assume we have just masters (so num of nodes and num of masters + * is the same), with 1/10 we always get over the majority, and specifically + * 80% of the number of nodes, to account for many masters failing at the + * same time. + * + * Since we have non-voting slaves that lower the probability of an entry + * to feature our node, we set the number of entries per packet as + * 10% of the total nodes we have. */ + wanted = floor(dictSize(server.cluster->nodes)/10); + if (wanted < 3) wanted = 3; + if (wanted > freshnodes) wanted = freshnodes; + + /* Include all the nodes in PFAIL state, so that failure reports are + * faster to propagate to go from PFAIL to FAIL state. */ + int pfail_wanted = server.cluster->stats_pfail_nodes; + + /* Compute the maximum estlen to allocate our buffer. We'll fix the estlen + * later according to the number of gossip sections we really were able + * to put inside the packet. */ + estlen = sizeof(clusterMsg) - sizeof(union clusterMsgData); + estlen += (sizeof(clusterMsgDataGossip)*(wanted + pfail_wanted)); + estlen += writePingExt(NULL, 0); + /* Note: clusterBuildMessageHdr() expects the buffer to be always at least + * sizeof(clusterMsg) or more. 
*/ + if (estlen < (int)sizeof(clusterMsg)) estlen = sizeof(clusterMsg); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, estlen); + clusterMsg *hdr = &msgblock->msg; + + if (!link->inbound && type == CLUSTERMSG_TYPE_PING) + link->node->ping_sent = mstime(); + + /* Populate the gossip fields */ + int maxiterations = wanted*3; + while(freshnodes > 0 && gossipcount < wanted && maxiterations--) { + dictEntry *de = dictGetRandomKey(server.cluster->nodes); + clusterNode *this = dictGetVal(de); + + /* Don't include this node: the whole packet header is about us + * already, so we just gossip about other nodes. + * Also, don't include the receiver. Receiver will not update its state + * based on gossips about itself. */ + if (this == myself || this == link->node) continue; + + /* PFAIL nodes will be added later. */ + if (this->flags & CLUSTER_NODE_PFAIL) continue; + + /* In the gossip section don't include: + * 1) Nodes in HANDSHAKE state. + * 2) Nodes with the NOADDR flag set. + * 3) Disconnected nodes if they don't have configured slots. + */ + if (this->flags & (CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_NOADDR) || + (this->link == NULL && this->numslots == 0)) + { + freshnodes--; /* Technically not correct, but saves CPU. */ + continue; + } + + /* Do not add a node we already have. */ + if (this->last_in_ping_gossip == cluster_pings_sent) continue; + + /* Add it */ + clusterSetGossipEntry(hdr,gossipcount,this); + this->last_in_ping_gossip = cluster_pings_sent; + freshnodes--; + gossipcount++; + } + + /* If there are PFAIL nodes, add them at the end.
*/ + if (pfail_wanted) { + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL && pfail_wanted > 0) { + clusterNode *node = dictGetVal(de); + if (node->flags & CLUSTER_NODE_HANDSHAKE) continue; + if (node->flags & CLUSTER_NODE_NOADDR) continue; + if (!(node->flags & CLUSTER_NODE_PFAIL)) continue; + clusterSetGossipEntry(hdr,gossipcount,node); + gossipcount++; + /* We take the count of the slots we allocated, since the + * PFAIL stats may not match perfectly with the current number + * of PFAIL nodes. */ + pfail_wanted--; + } + dictReleaseIterator(di); + } + + /* Compute the actual total length and send! */ + uint32_t totlen = 0; + totlen += writePingExt(hdr, gossipcount); + totlen += sizeof(clusterMsg)-sizeof(union clusterMsgData); + totlen += (sizeof(clusterMsgDataGossip)*gossipcount); + serverAssert(gossipcount < USHRT_MAX); + hdr->count = htons(gossipcount); + hdr->totlen = htonl(totlen); + + clusterSendMessage(link,msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Send a PONG packet to every connected node that's not in handshake state + * and for which we have a valid link. + * + * In Redis Cluster pongs are not used just for failure detection, but also + * to carry important configuration information. So broadcasting a pong is + * useful when something changes in the configuration and we want to make + * the cluster aware ASAP (for instance after a slave promotion). + * + * The 'target' argument specifies the receiving instances using the + * defines below: + * + * CLUSTER_BROADCAST_ALL -> All known instances. + * CLUSTER_BROADCAST_LOCAL_SLAVES -> All slaves in my master-slaves ring. 
+ */ +#define CLUSTER_BROADCAST_ALL 0 +#define CLUSTER_BROADCAST_LOCAL_SLAVES 1 +void clusterBroadcastPong(int target) { + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (!node->link) continue; + if (node == myself || nodeInHandshake(node)) continue; + if (target == CLUSTER_BROADCAST_LOCAL_SLAVES) { + int local_slave = + nodeIsSlave(node) && node->slaveof && + (node->slaveof == myself || node->slaveof == myself->slaveof); + if (!local_slave) continue; + } + clusterSendPing(node->link,CLUSTERMSG_TYPE_PONG); + } + dictReleaseIterator(di); +} + +/* Create a PUBLISH message block. + * + * Sanitizer suppression: In clusterMsgDataPublish, sizeof(bulk_data) is 8. + * As all the struct is used as a buffer, when more than 8 bytes are copied into + * the 'bulk_data', sanitizer generates an out-of-bounds error which is a false + * positive in this context. */ +REDIS_NO_SANITIZE("bounds") +clusterMsgSendBlock *clusterCreatePublishMsgBlock(robj *channel, robj *message, uint16_t type) { + + uint32_t channel_len, message_len; + + channel = getDecodedObject(channel); + message = getDecodedObject(message); + channel_len = sdslen(channel->ptr); + message_len = sdslen(message->ptr); + + size_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + msglen += sizeof(clusterMsgDataPublish) - 8 + channel_len + message_len; + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, msglen); + + clusterMsg *hdr = &msgblock->msg; + hdr->data.publish.msg.channel_len = htonl(channel_len); + hdr->data.publish.msg.message_len = htonl(message_len); + memcpy(hdr->data.publish.msg.bulk_data,channel->ptr,sdslen(channel->ptr)); + memcpy(hdr->data.publish.msg.bulk_data+sdslen(channel->ptr), + message->ptr,sdslen(message->ptr)); + + decrRefCount(channel); + decrRefCount(message); + + return msgblock; +} + +/* Send a FAIL message to all the nodes we are able to 
contact. + * The FAIL message is sent when we detect that a node is failing + * (CLUSTER_NODE_PFAIL) and we also receive a gossip confirmation of this: + * we switch the node state to CLUSTER_NODE_FAIL and ask all the other + * nodes to do the same ASAP. */ +void clusterSendFail(char *nodename) { + uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + + sizeof(clusterMsgDataFail); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAIL, msglen); + + clusterMsg *hdr = &msgblock->msg; + memcpy(hdr->data.fail.about.nodename,nodename,CLUSTER_NAMELEN); + + clusterBroadcastMessage(msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Send an UPDATE message to the specified link carrying the specified 'node' + * slots configuration. The node name, slots bitmap, and configEpoch info + * are included. */ +void clusterSendUpdate(clusterLink *link, clusterNode *node) { + if (link == NULL) return; + + uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + + sizeof(clusterMsgDataUpdate); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_UPDATE, msglen); + + clusterMsg *hdr = &msgblock->msg; + memcpy(hdr->data.update.nodecfg.nodename,node->name,CLUSTER_NAMELEN); + hdr->data.update.nodecfg.configEpoch = htonu64(node->configEpoch); + memcpy(hdr->data.update.nodecfg.slots,node->slots,sizeof(node->slots)); + for (unsigned int i = 0; i < sizeof(node->slots); i++) { + /* Don't advertise slots that the node stopped claiming */ + hdr->data.update.nodecfg.slots[i] = hdr->data.update.nodecfg.slots[i] & (~server.cluster->owner_not_claiming_slot[i]); + } + + clusterSendMessage(link,msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Send a MODULE message. + * + * If link is NULL, then the message is broadcasted to the whole cluster. 
*/ +void clusterSendModule(clusterLink *link, uint64_t module_id, uint8_t type, + const char *payload, uint32_t len) { + uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + msglen += sizeof(clusterMsgModule) - 3 + len; + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MODULE, msglen); + + clusterMsg *hdr = &msgblock->msg; + hdr->data.module.msg.module_id = module_id; /* Already endian adjusted. */ + hdr->data.module.msg.type = type; + hdr->data.module.msg.len = htonl(len); + memcpy(hdr->data.module.msg.bulk_data,payload,len); + + if (link) + clusterSendMessage(link,msgblock); + else + clusterBroadcastMessage(msgblock); + + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* This function gets a cluster node ID string as target, the same way the nodes + * addresses are represented in the modules side, resolves the node, and sends + * the message. If the target is NULL the message is broadcasted. + * + * The function returns C_OK if the target is valid, otherwise C_ERR is + * returned. */ +int clusterSendModuleMessageToTarget(const char *target, uint64_t module_id, uint8_t type, const char *payload, uint32_t len) { + clusterNode *node = NULL; + + if (target != NULL) { + node = clusterLookupNode(target, strlen(target)); + if (node == NULL || node->link == NULL) return C_ERR; + } + + clusterSendModule(target ? node->link : NULL, + module_id, type, payload, len); + return C_OK; +} + +/* ----------------------------------------------------------------------------- + * CLUSTER Pub/Sub support + * + * If `sharded` is 0: + * For now we do very little, just propagating [S]PUBLISH messages across the whole + * cluster. In the future we'll try to get smarter and avoiding propagating those + * messages to hosts without receives for a given channel. + * Otherwise: + * Publish this message across the slot (primary/replica). 
+ * -------------------------------------------------------------------------- */ +void clusterPropagatePublish(robj *channel, robj *message, int sharded) { + clusterMsgSendBlock *msgblock; + + if (!sharded) { + msgblock = clusterCreatePublishMsgBlock(channel, message, CLUSTERMSG_TYPE_PUBLISH); + clusterBroadcastMessage(msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); + return; + } + + listIter li; + listNode *ln; + list *nodes_for_slot = clusterGetNodesInMyShard(server.cluster->myself); + serverAssert(nodes_for_slot != NULL); + listRewind(nodes_for_slot, &li); + msgblock = clusterCreatePublishMsgBlock(channel, message, CLUSTERMSG_TYPE_PUBLISHSHARD); + while((ln = listNext(&li))) { + clusterNode *node = listNodeValue(ln); + if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) + continue; + clusterSendMessage(node->link,msgblock); + } + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* ----------------------------------------------------------------------------- + * SLAVE node specific functions + * -------------------------------------------------------------------------- */ + +/* This function sends a FAILOVER_AUTH_REQUEST message to every node in order to + * see if there is the quorum for this slave instance to failover its failing + * master. + * + * Note that we send the failover request to everybody, master and slave nodes, + * but only the masters are supposed to reply to our query. */ +void clusterRequestFailoverAuth(void) { + uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST, msglen); + + clusterMsg *hdr = &msgblock->msg; + /* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit + * in the header to communicate the nodes receiving the message that + * they should authorized the failover even if the master is working. 
*/ + if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; + clusterBroadcastMessage(msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Send a FAILOVER_AUTH_ACK message to the specified node. */ +void clusterSendFailoverAuth(clusterNode *node) { + if (!node->link) return; + + uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK, msglen); + + clusterSendMessage(node->link,msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Send a MFSTART message to the specified node. */ +void clusterSendMFStart(clusterNode *node) { + if (!node->link) return; + + uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MFSTART, msglen); + + clusterSendMessage(node->link,msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + +/* Vote for the node asking for our vote if there are the conditions. */ +void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { + clusterNode *master = node->slaveof; + uint64_t requestCurrentEpoch = ntohu64(request->currentEpoch); + uint64_t requestConfigEpoch = ntohu64(request->configEpoch); + unsigned char *claimed_slots = request->myslots; + int force_ack = request->mflags[0] & CLUSTERMSG_FLAG0_FORCEACK; + int j; + + /* IF we are not a master serving at least 1 slot, we don't have the + * right to vote, as the cluster size in Redis Cluster is the number + * of masters serving at least one slot, and quorum is the cluster + * size + 1 */ + if (nodeIsSlave(myself) || myself->numslots == 0) return; + + /* Request epoch must be >= our currentEpoch. + * Note that it is impossible for it to actually be greater since + * our currentEpoch was updated as a side effect of receiving this + * request, if the request epoch was greater. 
*/ + if (requestCurrentEpoch < server.cluster->currentEpoch) { + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): reqEpoch (%llu) < curEpoch(%llu)", + node->name, node->human_nodename, + (unsigned long long) requestCurrentEpoch, + (unsigned long long) server.cluster->currentEpoch); + return; + } + + /* I already voted for this epoch? Return ASAP. */ + if (server.cluster->lastVoteEpoch == server.cluster->currentEpoch) { + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): already voted for epoch %llu", + node->name, node->human_nodename, + (unsigned long long) server.cluster->currentEpoch); + return; + } + + /* Node must be a slave and its master down. + * The master can be non failing if the request is flagged + * with CLUSTERMSG_FLAG0_FORCEACK (manual failover). */ + if (clusterNodeIsMaster(node) || master == NULL || + (!nodeFailed(master) && !force_ack)) + { + if (clusterNodeIsMaster(node)) { + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): it is a master node", + node->name, node->human_nodename); + } else if (master == NULL) { + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): I don't know its master", + node->name, node->human_nodename); + } else if (!nodeFailed(master)) { + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): its master is up", + node->name, node->human_nodename); + } + return; + } + + /* We did not voted for a slave about this master for two + * times the node timeout. This is not strictly needed for correctness + * of the algorithm but makes the base case more linear. 
*/ + if (mstime() - node->slaveof->voted_time < server.cluster_node_timeout * 2) + { + serverLog(LL_WARNING, + "Failover auth denied to %.40s %s: " + "can't vote about this master before %lld milliseconds", + node->name, node->human_nodename, + (long long) ((server.cluster_node_timeout*2)- + (mstime() - node->slaveof->voted_time))); + return; + } + + /* The slave requesting the vote must have a configEpoch for the claimed + * slots that is >= the one of the masters currently serving the same + * slots in the current configuration. */ + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (bitmapTestBit(claimed_slots, j) == 0) continue; + if (isSlotUnclaimed(j) || + server.cluster->slots[j]->configEpoch <= requestConfigEpoch) + { + continue; + } + /* If we reached this point we found a slot that in our current slots + * is served by a master with a greater configEpoch than the one claimed + * by the slave requesting our vote. Refuse to vote for this slave. */ + serverLog(LL_WARNING, + "Failover auth denied to %.40s (%s): " + "slot %d epoch (%llu) > reqEpoch (%llu)", + node->name, node->human_nodename, j, + (unsigned long long) server.cluster->slots[j]->configEpoch, + (unsigned long long) requestConfigEpoch); + return; + } + + /* We can vote for this slave. */ + server.cluster->lastVoteEpoch = server.cluster->currentEpoch; + node->slaveof->voted_time = mstime(); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_FSYNC_CONFIG); + clusterSendFailoverAuth(node); + serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", + node->name, node->human_nodename, (unsigned long long) server.cluster->currentEpoch); +} + +/* This function returns the "rank" of this instance, a slave, in the context + * of its master-slaves ring. The rank of the slave is given by the number of + * other slaves for the same master that have a better replication offset + * compared to the local one (better means, greater, so they claim more data). 
+ * + * A slave with rank 0 is the one with the greatest (most up to date) + * replication offset, and so forth. Note that because how the rank is computed + * multiple slaves may have the same rank, in case they have the same offset. + * + * The slave rank is used to add a delay to start an election in order to + * get voted and replace a failing master. Slaves with better replication + * offsets are more likely to win. */ +int clusterGetSlaveRank(void) { + long long myoffset; + int j, rank = 0; + clusterNode *master; + + serverAssert(nodeIsSlave(myself)); + master = myself->slaveof; + if (master == NULL) return 0; /* Never called by slaves without master. */ + + myoffset = replicationGetSlaveOffset(); + for (j = 0; j < master->numslaves; j++) + if (master->slaves[j] != myself && + !nodeCantFailover(master->slaves[j]) && + master->slaves[j]->repl_offset > myoffset) rank++; + return rank; +} + +/* This function is called by clusterHandleSlaveFailover() in order to + * let the slave log why it is not able to failover. Sometimes there are + * not the conditions, but since the failover function is called again and + * again, we can't log the same things continuously. + * + * This function works by logging only if a given set of conditions are + * true: + * + * 1) The reason for which the failover can't be initiated changed. + * The reasons also include a NONE reason we reset the state to + * when the slave finds that its master is fine (no FAIL flag). + * 2) Also, the log is emitted again if the master is still down and + * the reason for not failing over is still the same, but more than + * CLUSTER_CANT_FAILOVER_RELOG_PERIOD seconds elapsed. + * 3) Finally, the function only logs if the slave is down for more than + * five seconds + NODE_TIMEOUT. This way nothing is logged when a + * failover starts in a reasonable time. + * + * The function is called with the reason why the slave can't failover + * which is one of the integer macros CLUSTER_CANT_FAILOVER_*. 
+ * + * The function is guaranteed to be called only if 'myself' is a slave. */ +void clusterLogCantFailover(int reason) { + char *msg; + static time_t lastlog_time = 0; + mstime_t nolog_fail_time = server.cluster_node_timeout + 5000; + + /* Don't log if we have the same reason for some time. */ + if (reason == server.cluster->cant_failover_reason && + time(NULL)-lastlog_time < CLUSTER_CANT_FAILOVER_RELOG_PERIOD) + return; + + server.cluster->cant_failover_reason = reason; + + /* We also don't emit any log if the master failed no long ago, the + * goal of this function is to log slaves in a stalled condition for + * a long time. */ + if (myself->slaveof && + nodeFailed(myself->slaveof) && + (mstime() - myself->slaveof->fail_time) < nolog_fail_time) return; + + switch(reason) { + case CLUSTER_CANT_FAILOVER_DATA_AGE: + msg = "Disconnected from master for longer than allowed. " + "Please check the 'cluster-replica-validity-factor' configuration " + "option."; + break; + case CLUSTER_CANT_FAILOVER_WAITING_DELAY: + msg = "Waiting the delay before I can start a new failover."; + break; + case CLUSTER_CANT_FAILOVER_EXPIRED: + msg = "Failover attempt expired."; + break; + case CLUSTER_CANT_FAILOVER_WAITING_VOTES: + msg = "Waiting for votes, but majority still not reached."; + break; + default: + msg = "Unknown reason code."; + break; + } + lastlog_time = time(NULL); + serverLog(LL_NOTICE,"Currently unable to failover: %s", msg); + + int cur_vote = server.cluster->failover_auth_count; + int cur_quorum = (server.cluster->size / 2) + 1; + /* Emits a log when an election is in progress and waiting for votes or when the failover attempt expired. */ + if (reason == CLUSTER_CANT_FAILOVER_WAITING_VOTES || reason == CLUSTER_CANT_FAILOVER_EXPIRED) { + serverLog(LL_NOTICE, "Needed quorum: %d. 
Number of votes received so far: %d", cur_quorum, cur_vote); + } +} + +/* This function implements the final part of automatic and manual failovers, + * where the slave grabs its master's hash slots, and propagates the new + * configuration. + * + * Note that it's up to the caller to be sure that the node got a new + * configuration epoch already. */ +void clusterFailoverReplaceYourMaster(void) { + int j; + clusterNode *oldmaster = myself->slaveof; + + if (clusterNodeIsMaster(myself) || oldmaster == NULL) return; + + /* 1) Turn this node into a master. */ + clusterSetNodeAsMaster(myself); + replicationUnsetMaster(); + + /* 2) Claim all the slots assigned to our master. */ + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (clusterNodeCoversSlot(oldmaster, j)) { + clusterDelSlot(j); + clusterAddSlot(myself,j); + } + } + + /* 3) Update state and save config. */ + clusterUpdateState(); + clusterSaveConfigOrDie(1); + + /* 4) Pong all the other nodes so that they can update the state + * accordingly and detect that we switched to master role. */ + clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + + /* 5) If there was a manual failover in progress, clear the state. */ + resetManualFailover(); +} + +/* This function is called if we are a slave node and our master serving + * a non-zero amount of hash slots is in FAIL state. + * + * The goal of this function is: + * 1) To check if we are able to perform a failover, is our data updated? + * 2) Try to get elected by masters. + * 3) Perform the failover informing all the other nodes. 
+ */ +void clusterHandleSlaveFailover(void) { + mstime_t data_age; + mstime_t auth_age = mstime() - server.cluster->failover_auth_time; + int needed_quorum = (server.cluster->size / 2) + 1; + int manual_failover = server.cluster->mf_end != 0 && + server.cluster->mf_can_start; + mstime_t auth_timeout, auth_retry_time; + + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_HANDLE_FAILOVER; + + /* Compute the failover timeout (the max time we have to send votes + * and wait for replies), and the failover retry time (the time to wait + * before trying to get voted again). + * + * Timeout is MAX(NODE_TIMEOUT*2,2000) milliseconds. + * Retry is two times the Timeout. + */ + auth_timeout = server.cluster_node_timeout*2; + if (auth_timeout < 2000) auth_timeout = 2000; + auth_retry_time = auth_timeout*2; + + /* Pre conditions to run the function, that must be met both in case + * of an automatic or manual failover: + * 1) We are a slave. + * 2) Our master is flagged as FAIL, or this is a manual failover. + * 3) We don't have the no failover configuration set, and this is + * not a manual failover. + * 4) It is serving slots. */ + if (clusterNodeIsMaster(myself) || + myself->slaveof == NULL || + (!nodeFailed(myself->slaveof) && !manual_failover) || + (server.cluster_slave_no_failover && !manual_failover) || + myself->slaveof->numslots == 0) + { + /* There are no reasons to failover, so we set the reason why we + * are returning without failing over to NONE. */ + server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; + return; + } + + /* Set data_age to the number of milliseconds we are disconnected from + * the master. 
*/ + if (server.repl_state == REPL_STATE_CONNECTED) { + data_age = (mstime_t)(server.unixtime - server.master->lastinteraction) + * 1000; + } else { + data_age = (mstime_t)(server.unixtime - server.repl_down_since) * 1000; + } + + /* Remove the node timeout from the data age as it is fine that we are + * disconnected from our master at least for the time it was down to be + * flagged as FAIL, that's the baseline. */ + if (data_age > server.cluster_node_timeout) + data_age -= server.cluster_node_timeout; + + /* Check if our data is recent enough according to the slave validity + * factor configured by the user. + * + * Check bypassed for manual failovers. */ + if (server.cluster_slave_validity_factor && + data_age > + (((mstime_t)server.repl_ping_slave_period * 1000) + + (server.cluster_node_timeout * server.cluster_slave_validity_factor))) + { + if (!manual_failover) { + clusterLogCantFailover(CLUSTER_CANT_FAILOVER_DATA_AGE); + return; + } + } + + /* If the previous failover attempt timeout and the retry time has + * elapsed, we can setup a new one. */ + if (auth_age > auth_retry_time) { + server.cluster->failover_auth_time = mstime() + + 500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */ + random() % 500; /* Random delay between 0 and 500 milliseconds. */ + server.cluster->failover_auth_count = 0; + server.cluster->failover_auth_sent = 0; + server.cluster->failover_auth_rank = clusterGetSlaveRank(); + /* We add another delay that is proportional to the slave rank. + * Specifically 1 second * rank. This way slaves that have a probably + * less updated replication offset, are penalized. */ + server.cluster->failover_auth_time += + server.cluster->failover_auth_rank * 1000; + /* However if this is a manual failover, no delay is needed. 
*/ + if (server.cluster->mf_end) { + server.cluster->failover_auth_time = mstime(); + server.cluster->failover_auth_rank = 0; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); + } + serverLog(LL_NOTICE, + "Start of election delayed for %lld milliseconds " + "(rank #%d, offset %lld).", + server.cluster->failover_auth_time - mstime(), + server.cluster->failover_auth_rank, + replicationGetSlaveOffset()); + /* Now that we have a scheduled election, broadcast our offset + * to all the other slaves so that they'll updated their offsets + * if our offset is better. */ + clusterBroadcastPong(CLUSTER_BROADCAST_LOCAL_SLAVES); + return; + } + + /* It is possible that we received more updated offsets from other + * slaves for the same master since we computed our election delay. + * Update the delay if our rank changed. + * + * Not performed if this is a manual failover. */ + if (server.cluster->failover_auth_sent == 0 && + server.cluster->mf_end == 0) + { + int newrank = clusterGetSlaveRank(); + if (newrank > server.cluster->failover_auth_rank) { + long long added_delay = + (newrank - server.cluster->failover_auth_rank) * 1000; + server.cluster->failover_auth_time += added_delay; + server.cluster->failover_auth_rank = newrank; + serverLog(LL_NOTICE, + "Replica rank updated to #%d, added %lld milliseconds of delay.", + newrank, added_delay); + } + } + + /* Return ASAP if we can't still start the election. */ + if (mstime() < server.cluster->failover_auth_time) { + clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_DELAY); + return; + } + + /* Return ASAP if the election is too old to be valid. */ + if (auth_age > auth_timeout) { + clusterLogCantFailover(CLUSTER_CANT_FAILOVER_EXPIRED); + return; + } + + /* Ask for votes if needed. 
*/ + if (server.cluster->failover_auth_sent == 0) { + server.cluster->currentEpoch++; + server.cluster->failover_auth_epoch = server.cluster->currentEpoch; + serverLog(LL_NOTICE,"Starting a failover election for epoch %llu.", + (unsigned long long) server.cluster->currentEpoch); + clusterRequestFailoverAuth(); + server.cluster->failover_auth_sent = 1; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_FSYNC_CONFIG); + return; /* Wait for replies. */ + } + + /* Check if we reached the quorum. */ + if (server.cluster->failover_auth_count >= needed_quorum) { + /* We have the quorum, we can finally failover the master. */ + + serverLog(LL_NOTICE, + "Failover election won: I'm the new master."); + + /* Update my configEpoch to the epoch of the election. */ + if (myself->configEpoch < server.cluster->failover_auth_epoch) { + myself->configEpoch = server.cluster->failover_auth_epoch; + serverLog(LL_NOTICE, + "configEpoch set to %llu after successful failover", + (unsigned long long) myself->configEpoch); + } + + /* Take responsibility for the cluster slots. */ + clusterFailoverReplaceYourMaster(); + } else { + clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_VOTES); + } +} + +/* ----------------------------------------------------------------------------- + * CLUSTER slave migration + * + * Slave migration is the process that allows a slave of a master that is + * already covered by at least another slave, to "migrate" to a master that + * is orphaned, that is, left with no working slaves. + * ------------------------------------------------------------------------- */ + +/* This function is responsible to decide if this replica should be migrated + * to a different (orphaned) master. It is called by the clusterCron() function + * only if: + * + * 1) We are a slave node. + * 2) It was detected that there is at least one orphaned master in + * the cluster. 
+ * 3) We are a slave of one of the masters with the greatest number of + * slaves. + * + * This checks are performed by the caller since it requires to iterate + * the nodes anyway, so we spend time into clusterHandleSlaveMigration() + * if definitely needed. + * + * The function is called with a pre-computed max_slaves, that is the max + * number of working (not in FAIL state) slaves for a single master. + * + * Additional conditions for migration are examined inside the function. + */ +void clusterHandleSlaveMigration(int max_slaves) { + int j, okslaves = 0; + clusterNode *mymaster = myself->slaveof, *target = NULL, *candidate = NULL; + dictIterator *di; + dictEntry *de; + + /* Step 1: Don't migrate if the cluster state is not ok. */ + if (server.cluster->state != CLUSTER_OK) return; + + /* Step 2: Don't migrate if my master will not be left with at least + * 'migration-barrier' slaves after my migration. */ + if (mymaster == NULL) return; + for (j = 0; j < mymaster->numslaves; j++) + if (!nodeFailed(mymaster->slaves[j]) && + !nodeTimedOut(mymaster->slaves[j])) okslaves++; + if (okslaves <= server.cluster_migration_barrier) return; + + /* Step 3: Identify a candidate for migration, and check if among the + * masters with the greatest number of ok slaves, I'm the one with the + * smallest node ID (the "candidate slave"). + * + * Note: this means that eventually a replica migration will occur + * since slaves that are reachable again always have their FAIL flag + * cleared, so eventually there must be a candidate. + * There is a possible race condition causing multiple + * slaves to migrate at the same time, but this is unlikely to + * happen and relatively harmless when it does. 
*/ + candidate = myself; + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + int okslaves = 0, is_orphaned = 1; + + /* We want to migrate only if this master is working, orphaned, and + * used to have slaves or if failed over a master that had slaves + * (MIGRATE_TO flag). This way we only migrate to instances that were + * supposed to have replicas. */ + if (nodeIsSlave(node) || nodeFailed(node)) is_orphaned = 0; + if (!(node->flags & CLUSTER_NODE_MIGRATE_TO)) is_orphaned = 0; + + /* Check number of working slaves. */ + if (clusterNodeIsMaster(node)) okslaves = clusterCountNonFailingSlaves(node); + if (okslaves > 0) is_orphaned = 0; + + if (is_orphaned) { + if (!target && node->numslots > 0) target = node; + + /* Track the starting time of the orphaned condition for this + * master. */ + if (!node->orphaned_time) node->orphaned_time = mstime(); + } else { + node->orphaned_time = 0; + } + + /* Check if I'm the slave candidate for the migration: attached + * to a master with the maximum number of slaves and with the smallest + * node ID. */ + if (okslaves == max_slaves) { + for (j = 0; j < node->numslaves; j++) { + if (memcmp(node->slaves[j]->name, + candidate->name, + CLUSTER_NAMELEN) < 0) + { + candidate = node->slaves[j]; + } + } + } + } + dictReleaseIterator(di); + + /* Step 4: perform the migration if there is a target, and if I'm the + * candidate, but only if the master is continuously orphaned for a + * couple of seconds, so that during failovers, we give some time to + * the natural slaves of this instance to advertise their switch from + * the old master to the new one. 
*/ + if (target && candidate == myself && + (mstime()-target->orphaned_time) > CLUSTER_SLAVE_MIGRATION_DELAY && + !(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) + { + serverLog(LL_NOTICE,"Migrating to orphaned master %.40s", + target->name); + clusterSetMaster(target); + } +} + +/* ----------------------------------------------------------------------------- + * CLUSTER manual failover + * + * This are the important steps performed by slaves during a manual failover: + * 1) User send CLUSTER FAILOVER command. The failover state is initialized + * setting mf_end to the millisecond unix time at which we'll abort the + * attempt. + * 2) Slave sends a MFSTART message to the master requesting to pause clients + * for two times the manual failover timeout CLUSTER_MF_TIMEOUT. + * When master is paused for manual failover, it also starts to flag + * packets with CLUSTERMSG_FLAG0_PAUSED. + * 3) Slave waits for master to send its replication offset flagged as PAUSED. + * 4) If slave received the offset from the master, and its offset matches, + * mf_can_start is set to 1, and clusterHandleSlaveFailover() will perform + * the failover as usually, with the difference that the vote request + * will be modified to force masters to vote for a slave that has a + * working master. + * + * From the point of view of the master things are simpler: when a + * PAUSE_CLIENTS packet is received the master sets mf_end as well and + * the sender in mf_slave. During the time limit for the manual failover + * the master will just send PINGs more often to this slave, flagged with + * the PAUSED flag, so that the slave will set mf_master_offset when receiving + * a packet from the master with this flag set. + * + * The goal of the manual failover is to perform a fast failover without + * data loss due to the asynchronous master-slave replication. + * -------------------------------------------------------------------------- */ + +/* Reset the manual failover state. 
This works for both masters and slaves + * as all the state about manual failover is cleared. + * + * The function can be used both to initialize the manual failover state at + * startup or to abort a manual failover in progress. */ +void resetManualFailover(void) { + if (server.cluster->mf_slave) { + /* We were a master failing over, so we paused clients and related actions. + * Regardless of the outcome we unpause now to allow traffic again. */ + unpauseActions(PAUSE_DURING_FAILOVER); + } + server.cluster->mf_end = 0; /* No manual failover in progress. */ + server.cluster->mf_can_start = 0; + server.cluster->mf_slave = NULL; + server.cluster->mf_master_offset = -1; +} + +/* If a manual failover timed out, abort it. */ +void manualFailoverCheckTimeout(void) { + if (server.cluster->mf_end && server.cluster->mf_end < mstime()) { + serverLog(LL_WARNING,"Manual failover timed out."); + resetManualFailover(); + } +} + +/* This function is called from the cluster cron function in order to go + * forward with a manual failover state machine. */ +void clusterHandleManualFailover(void) { + /* Return ASAP if no manual failover is in progress. */ + if (server.cluster->mf_end == 0) return; + + /* If mf_can_start is non-zero, the failover was already triggered so the + * next steps are performed by clusterHandleSlaveFailover(). */ + if (server.cluster->mf_can_start) return; + + if (server.cluster->mf_master_offset == -1) return; /* Wait for offset... */ + + if (server.cluster->mf_master_offset == replicationGetSlaveOffset()) { + /* Our replication offset matches the master replication offset + * announced after clients were paused. We can start the failover. 
*/ + server.cluster->mf_can_start = 1; + serverLog(LL_NOTICE, + "All master replication stream processed, " + "manual failover can start."); + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); + return; + } + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); +} + +/* ----------------------------------------------------------------------------- + * CLUSTER cron job + * -------------------------------------------------------------------------- */ + +/* Check if the node is disconnected and re-establish the connection. + * Also update a few stats while we are here, that can be used to make + * better decisions in other part of the code. */ +static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_timeout, mstime_t now) { + /* Not interested in reconnecting the link with myself or nodes + * for which we have no address. */ + if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR)) return 1; + + if (node->flags & CLUSTER_NODE_PFAIL) + server.cluster->stats_pfail_nodes++; + + /* A Node in HANDSHAKE state has a limited lifespan equal to the + * configured node timeout. */ + if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) { + clusterDelNode(node); + return 1; + } + + if (node->link == NULL) { + clusterLink *link = createClusterLink(node); + link->conn = connCreate(connTypeOfCluster()); + connSetPrivateData(link->conn, link); + if (connConnect(link->conn, node->ip, node->cport, server.bind_source_addr, + clusterLinkConnectHandler) == C_ERR) { + /* We got a synchronous error from connect before + * clusterSendPing() had a chance to be called. + * If node->ping_sent is zero, failure detection can't work, + * so we claim we actually sent a ping now (that will + * be really sent as soon as the link is obtained). 
*/ + if (node->ping_sent == 0) node->ping_sent = mstime(); + serverLog(LL_DEBUG, "Unable to connect to " + "Cluster Node [%s]:%d -> %s", node->ip, + node->cport, server.neterr); + + freeClusterLink(link); + return 0; + } + } + return 0; +} + +static void freeClusterLinkOnBufferLimitReached(clusterLink *link) { + if (link == NULL || server.cluster_link_msg_queue_limit_bytes == 0) { + return; + } + + unsigned long long mem_link = link->send_msg_queue_mem; + if (mem_link > server.cluster_link_msg_queue_limit_bytes) { + serverLog(LL_WARNING, "Freeing cluster link(%s node %.40s, used memory: %llu) due to " + "exceeding send buffer memory limit.", link->inbound ? "from" : "to", + link->node ? link->node->name : "", mem_link); + freeClusterLink(link); + server.cluster->stat_cluster_links_buffer_limit_exceeded++; + } +} + +/* Free outbound link to a node if its send buffer size exceeded limit. */ +static void clusterNodeCronFreeLinkOnBufferLimitReached(clusterNode *node) { + freeClusterLinkOnBufferLimitReached(node->link); + freeClusterLinkOnBufferLimitReached(node->inbound_link); +} + +/* This is executed 10 times every second */ +void clusterCron(void) { + dictIterator *di; + dictEntry *de; + int update_state = 0; + int orphaned_masters; /* How many masters there are without ok slaves. */ + int max_slaves; /* Max number of ok slaves for a single master. */ + int this_slaves; /* Number of ok slaves for our master (if we are slave). */ + mstime_t min_pong = 0, now = mstime(); + clusterNode *min_pong_node = NULL; + static unsigned long long iteration = 0; + mstime_t handshake_timeout; + + iteration++; /* Number of times this function was called so far. */ + + clusterUpdateMyselfHostname(); + + /* The handshake timeout is the time after which a handshake node that was + * not turned into a normal node is removed from the nodes. Usually it is + * just the NODE_TIMEOUT value, but when NODE_TIMEOUT is too small we use + * the value of 1 second. 
*/ + handshake_timeout = server.cluster_node_timeout; + if (handshake_timeout < 1000) handshake_timeout = 1000; + + /* Clear so clusterNodeCronHandleReconnect can count the number of nodes in PFAIL. */ + server.cluster->stats_pfail_nodes = 0; + /* Run through some of the operations we want to do on each cluster node. */ + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + /* We free the inbound or outboud link to the node if the link has an + * oversized message send queue and immediately try reconnecting. */ + clusterNodeCronFreeLinkOnBufferLimitReached(node); + /* The protocol is that function(s) below return non-zero if the node was + * terminated. + */ + if(clusterNodeCronHandleReconnect(node, handshake_timeout, now)) continue; + } + dictReleaseIterator(di); + + /* Ping some random node 1 time every 10 iterations, so that we usually ping + * one random node every second. */ + if (!(iteration % 10)) { + int j; + + /* Check a few random nodes and ping the one with the oldest + * pong_received time. */ + for (j = 0; j < 5; j++) { + de = dictGetRandomKey(server.cluster->nodes); + clusterNode *this = dictGetVal(de); + + /* Don't ping nodes disconnected or with a ping currently active. */ + if (this->link == NULL || this->ping_sent != 0) continue; + if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE)) + continue; + if (min_pong_node == NULL || min_pong > this->pong_received) { + min_pong_node = this; + min_pong = this->pong_received; + } + } + if (min_pong_node) { + serverLog(LL_DEBUG,"Pinging node %.40s", min_pong_node->name); + clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING); + } + } + + /* Iterate nodes to check if we need to flag something as failing. + * This loop is also responsible to: + * 1) Check if there are orphaned masters (masters without non failing + * slaves). + * 2) Count the max number of non failing slaves for a single master. 
+ * 3) Count the number of slaves for our master, if we are a slave. */ + orphaned_masters = 0; + max_slaves = 0; + this_slaves = 0; + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + now = mstime(); /* Use an updated time at every iteration. */ + + if (node->flags & + (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) + continue; + + /* Orphaned master check, useful only if the current instance + * is a slave that may migrate to another master. */ + if (nodeIsSlave(myself) && clusterNodeIsMaster(node) && !nodeFailed(node)) { + int okslaves = clusterCountNonFailingSlaves(node); + + /* A master is orphaned if it is serving a non-zero number of + * slots, have no working slaves, but used to have at least one + * slave, or failed over a master that used to have slaves. */ + if (okslaves == 0 && node->numslots > 0 && + node->flags & CLUSTER_NODE_MIGRATE_TO) + { + orphaned_masters++; + } + if (okslaves > max_slaves) max_slaves = okslaves; + if (myself->slaveof == node) + this_slaves = okslaves; + } + + /* If we are not receiving any data for more than half the cluster + * timeout, reconnect the link: maybe there is a connection + * issue even if the node is alive. */ + mstime_t ping_delay = now - node->ping_sent; + mstime_t data_delay = now - node->data_received; + if (node->link && /* is connected */ + now - node->link->ctime > + server.cluster_node_timeout && /* was not already reconnected */ + node->ping_sent && /* we already sent a ping */ + /* and we are waiting for the pong more than timeout/2 */ + ping_delay > server.cluster_node_timeout/2 && + /* and in such interval we are not seeing any traffic at all. */ + data_delay > server.cluster_node_timeout/2) + { + /* Disconnect the link, it will be reconnected automatically. 
*/ + freeClusterLink(node->link); + } + + /* If we have currently no active ping in this instance, and the + * received PONG is older than half the cluster timeout, send + * a new ping now, to ensure all the nodes are pinged without + * a too big delay. */ + mstime_t ping_interval = server.cluster_ping_interval ? + server.cluster_ping_interval : server.cluster_node_timeout/2; + if (node->link && + node->ping_sent == 0 && + (now - node->pong_received) > ping_interval) + { + clusterSendPing(node->link, CLUSTERMSG_TYPE_PING); + continue; + } + + /* If we are a master and one of the slaves requested a manual + * failover, ping it continuously. */ + if (server.cluster->mf_end && + clusterNodeIsMaster(myself) && + server.cluster->mf_slave == node && + node->link) + { + clusterSendPing(node->link, CLUSTERMSG_TYPE_PING); + continue; + } + + /* Check only if we have an active ping for this instance. */ + if (node->ping_sent == 0) continue; + + /* Check if this node looks unreachable. + * Note that if we already received the PONG, then node->ping_sent + * is zero, so can't reach this code at all, so we don't risk of + * checking for a PONG delay if we didn't sent the PING. + * + * We also consider every incoming data as proof of liveness, since + * our cluster bus link is also used for data: under heavy data + * load pong delays are possible. */ + mstime_t node_delay = (ping_delay < data_delay) ? ping_delay : + data_delay; + + if (node_delay > server.cluster_node_timeout) { + /* Timeout reached. Set the node as possibly failing if it is + * not already in this state. 
*/ + if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) { + node->flags |= CLUSTER_NODE_PFAIL; + update_state = 1; + if (clusterNodeIsMaster(myself) && server.cluster->size == 1) { + markNodeAsFailingIfNeeded(node); + } else { + serverLog(LL_DEBUG,"*** NODE %.40s possibly failing", node->name); + } + } + } + } + dictReleaseIterator(di); + + /* If we are a slave node but the replication is still turned off, + * enable it if we know the address of our master and it appears to + * be up. */ + if (nodeIsSlave(myself) && + server.masterhost == NULL && + myself->slaveof && + nodeHasAddr(myself->slaveof)) + { + replicationSetMaster(myself->slaveof->ip, getNodeDefaultReplicationPort(myself->slaveof)); + } + + /* Abort a manual failover if the timeout is reached. */ + manualFailoverCheckTimeout(); + + if (nodeIsSlave(myself)) { + clusterHandleManualFailover(); + if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) + clusterHandleSlaveFailover(); + /* If there are orphaned slaves, and we are a slave among the masters + * with the max number of non-failing slaves, consider migrating to + * the orphaned masters. Note that it does not make sense to try + * a migration if there is no master with at least *two* working + * slaves. */ + if (orphaned_masters && max_slaves >= 2 && this_slaves == max_slaves && + server.cluster_allow_replica_migration) + clusterHandleSlaveMigration(max_slaves); + } + + if (update_state || server.cluster->state == CLUSTER_FAIL) + clusterUpdateState(); +} + +/* This function is called before the event handler returns to sleep for + * events. It is useful to perform operations that must be done ASAP in + * reaction to events fired but that are not safe to perform inside event + * handlers, or to perform potentially expansive tasks that we need to do + * a single time before replying to clients. 
*/ +void clusterBeforeSleep(void) { + int flags = server.cluster->todo_before_sleep; + + /* Reset our flags (not strictly needed since every single function + * called for flags set should be able to clear its flag). */ + server.cluster->todo_before_sleep = 0; + + if (flags & CLUSTER_TODO_HANDLE_MANUALFAILOVER) { + /* Handle manual failover as soon as possible so that won't have a 100ms + * as it was handled only in clusterCron */ + if(nodeIsSlave(myself)) { + clusterHandleManualFailover(); + if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) + clusterHandleSlaveFailover(); + } + } else if (flags & CLUSTER_TODO_HANDLE_FAILOVER) { + /* Handle failover, this is needed when it is likely that there is already + * the quorum from masters in order to react fast. */ + clusterHandleSlaveFailover(); + } + + /* Update the cluster state. */ + if (flags & CLUSTER_TODO_UPDATE_STATE) + clusterUpdateState(); + + /* Save the config, possibly using fsync. */ + if (flags & CLUSTER_TODO_SAVE_CONFIG) { + int fsync = flags & CLUSTER_TODO_FSYNC_CONFIG; + clusterSaveConfigOrDie(fsync); + } +} + +void clusterDoBeforeSleep(int flags) { + server.cluster->todo_before_sleep |= flags; +} + +/* ----------------------------------------------------------------------------- + * Slots management + * -------------------------------------------------------------------------- */ + +/* Test bit 'pos' in a generic bitmap. Return 1 if the bit is set, + * otherwise 0. */ +int bitmapTestBit(unsigned char *bitmap, int pos) { + off_t byte = pos/8; + int bit = pos&7; + return (bitmap[byte] & (1<nodes); + dictEntry *de; + int slaves = 0; + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (nodeIsSlave(node)) continue; + slaves += node->numslaves; + } + dictReleaseIterator(di); + return slaves != 0; +} + +/* Set the slot bit and return the old value. 
*/ +int clusterNodeSetSlotBit(clusterNode *n, int slot) { + int old = bitmapTestBit(n->slots,slot); + if (!old) { + bitmapSetBit(n->slots,slot); + n->numslots++; + /* When a master gets its first slot, even if it has no slaves, + * it gets flagged with MIGRATE_TO, that is, the master is a valid + * target for replicas migration, if and only if at least one of + * the other masters has slaves right now. + * + * Normally masters are valid targets of replica migration if: + * 1. The used to have slaves (but no longer have). + * 2. They are slaves failing over a master that used to have slaves. + * + * However new masters with slots assigned are considered valid + * migration targets if the rest of the cluster is not a slave-less. + * + * See https://github.com/redis/redis/issues/3043 for more info. */ + if (n->numslots == 1 && clusterMastersHaveSlaves()) + n->flags |= CLUSTER_NODE_MIGRATE_TO; + } + return old; +} + +/* Clear the slot bit and return the old value. */ +int clusterNodeClearSlotBit(clusterNode *n, int slot) { + int old = bitmapTestBit(n->slots,slot); + if (old) { + bitmapClearBit(n->slots,slot); + n->numslots--; + } + return old; +} + +/* Return the slot bit from the cluster node structure. */ +int clusterNodeCoversSlot(clusterNode *n, int slot) { + return bitmapTestBit(n->slots,slot); +} + +/* Add the specified slot to the list of slots that node 'n' will + * serve. Return C_OK if the operation ended with success. + * If the slot is already assigned to another instance this is considered + * an error and C_ERR is returned. */ +int clusterAddSlot(clusterNode *n, int slot) { + if (server.cluster->slots[slot]) return C_ERR; + clusterNodeSetSlotBit(n,slot); + server.cluster->slots[slot] = n; + return C_OK; +} + +/* Delete the specified slot marking it as unassigned. + * Returns C_OK if the slot was assigned, otherwise if the slot was + * already unassigned C_ERR is returned. 
*/ +int clusterDelSlot(int slot) { + clusterNode *n = server.cluster->slots[slot]; + + if (!n) return C_ERR; + + /* Cleanup the channels in master/replica as part of slot deletion. */ + removeChannelsInSlot(slot); + /* Clear the slot bit. */ + serverAssert(clusterNodeClearSlotBit(n,slot) == 1); + server.cluster->slots[slot] = NULL; + /* Make owner_not_claiming_slot flag consistent with slot ownership information. */ + bitmapClearBit(server.cluster->owner_not_claiming_slot, slot); + return C_OK; +} + +/* Transfer slots from `from_node` to `to_node`. + * Iterates over all cluster slots, transferring each slot covered by `from_node` to `to_node`. + * Counts and returns the number of slots transferred. */ +int clusterMoveNodeSlots(clusterNode *from_node, clusterNode *to_node) { + int processed = 0; + + for (int j = 0; j < CLUSTER_SLOTS; j++) { + if (clusterNodeCoversSlot(from_node, j)) { + clusterDelSlot(j); + clusterAddSlot(to_node, j); + processed++; + } + } + return processed; +} + +/* Delete all the slots associated with the specified node. + * The number of deleted slots is returned. */ +int clusterDelNodeSlots(clusterNode *node) { + int deleted = 0, j; + + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (clusterNodeCoversSlot(node, j)) { + clusterDelSlot(j); + deleted++; + } + } + return deleted; +} + +/* Clear the migrating / importing state for all the slots. + * This is useful at initialization and when turning a master into slave. 
*/ +void clusterCloseAllSlots(void) { + memset(server.cluster->migrating_slots_to,0, + sizeof(server.cluster->migrating_slots_to)); + memset(server.cluster->importing_slots_from,0, + sizeof(server.cluster->importing_slots_from)); +} + +/* ----------------------------------------------------------------------------- + * Cluster state evaluation function + * -------------------------------------------------------------------------- */ + +/* The following are defines that are only used in the evaluation function + * and are based on heuristics. Actually the main point about the rejoin and + * writable delay is that they should be a few orders of magnitude larger + * than the network latency. */ +#define CLUSTER_MAX_REJOIN_DELAY 5000 +#define CLUSTER_MIN_REJOIN_DELAY 500 +#define CLUSTER_WRITABLE_DELAY 2000 + +void clusterUpdateState(void) { + int j, new_state; + int reachable_masters = 0; + static mstime_t among_minority_time; + static mstime_t first_call_time = 0; + + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_UPDATE_STATE; + + /* If this is a master node, wait some time before turning the state + * into OK, since it is not a good idea to rejoin the cluster as a writable + * master, after a reboot, without giving the cluster a chance to + * reconfigure this node. Note that the delay is calculated starting from + * the first call to this function and not since the server start, in order + * to not count the DB loading time. */ + if (first_call_time == 0) first_call_time = mstime(); + if (clusterNodeIsMaster(myself) && + server.cluster->state == CLUSTER_FAIL && + mstime() - first_call_time < CLUSTER_WRITABLE_DELAY) return; + + /* Start assuming the state is OK. We'll turn it into FAIL if there + * are the right conditions. */ + new_state = CLUSTER_OK; + + /* Check if all the slots are covered.
 * NOTE(review): kept verbatim — the quorum math and rejoin-delay clamping below
 * gate when a rejoining master accepts writes; statement order is significant.
*/ + if (server.cluster_require_full_coverage) { + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (server.cluster->slots[j] == NULL || + server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL)) + { + new_state = CLUSTER_FAIL; + break; + } + } + } + + /* Compute the cluster size, that is the number of master nodes + * serving at least a single slot. + * + * At the same time count the number of reachable masters having + * at least one slot. */ + { + dictIterator *di; + dictEntry *de; + + server.cluster->size = 0; + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (clusterNodeIsMaster(node) && node->numslots) { + server.cluster->size++; + if ((node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) == 0) + reachable_masters++; + } + } + dictReleaseIterator(di); + } + + /* If we are in a minority partition, change the cluster state + * to FAIL. */ + { + int needed_quorum = (server.cluster->size / 2) + 1; + + if (reachable_masters < needed_quorum) { + new_state = CLUSTER_FAIL; + among_minority_time = mstime(); + } + } + + /* Log a state change */ + if (new_state != server.cluster->state) { + mstime_t rejoin_delay = server.cluster_node_timeout; + + /* If the instance is a master and was partitioned away with the + * minority, don't let it accept queries for some time after the + * partition heals, to make sure there is enough time to receive + * a configuration update. */ + if (rejoin_delay > CLUSTER_MAX_REJOIN_DELAY) + rejoin_delay = CLUSTER_MAX_REJOIN_DELAY; + if (rejoin_delay < CLUSTER_MIN_REJOIN_DELAY) + rejoin_delay = CLUSTER_MIN_REJOIN_DELAY; + + if (new_state == CLUSTER_OK && + clusterNodeIsMaster(myself) && + mstime() - among_minority_time < rejoin_delay) + { + return; + } + + /* Change the state and log the event. */ + serverLog(new_state == CLUSTER_OK ? LL_NOTICE : LL_WARNING, + "Cluster state changed: %s", + new_state == CLUSTER_OK ?
"ok" : "fail"); + server.cluster->state = new_state; + } +} + +/* This function is called after the node startup in order to verify that data + * loaded from disk is in agreement with the cluster configuration: + * + * 1) If we find keys about hash slots we have no responsibility for, the + * following happens: + * A) If no other node is in charge according to the current cluster + * configuration, we add these slots to our node. + * B) If according to our config other nodes are already in charge for + * this slots, we set the slots as IMPORTING from our point of view + * in order to justify we have those slots, and in order to make + * redis-cli aware of the issue, so that it can try to fix it. + * 2) If we find data in a DB different than DB0 we return C_ERR to + * signal the caller it should quit the server with an error message + * or take other actions. + * + * The function always returns C_OK even if it will try to correct + * the error described in "1". However if data is found in DB different + * from DB0, C_ERR is returned. + * + * The function also uses the logging facility in order to warn the user + * about desynchronizations between the data we have in memory and the + * cluster configuration. */ +int verifyClusterConfigWithData(void) { + int j; + int update_config = 0; + + /* Return ASAP if a module disabled cluster redirections. In that case + * every master can store keys about every possible hash slot. */ + if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION) + return C_OK; + + /* If this node is a slave, don't perform the check at all as we + * completely depend on the replication stream. */ + if (nodeIsSlave(myself)) return C_OK; + + /* Make sure we only have keys in DB0. */ + for (j = 1; j < server.dbnum; j++) { + if (kvstoreSize(server.db[j].keys)) return C_ERR; + } + + /* Check that all the slots we see populated memory have a corresponding + * entry in the cluster table. Otherwise fix the table.
*/ + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (!countKeysInSlot(j)) continue; /* No keys in this slot. */ + /* Check if we are assigned to this slot or if we are importing it. + * In both cases check the next slot as the configuration makes + * sense. */ + if (server.cluster->slots[j] == myself || + server.cluster->importing_slots_from[j] != NULL) continue; + + /* If we are here data and cluster config don't agree, and we have + * slot 'j' populated even if we are not importing it, nor we are + * assigned to this slot. Fix this condition. */ + + update_config++; + /* Case A: slot is unassigned. Take responsibility for it. */ + if (server.cluster->slots[j] == NULL) { + serverLog(LL_NOTICE, "I have keys for unassigned slot %d. " + "Taking responsibility for it.",j); + clusterAddSlot(myself,j); + } else { + serverLog(LL_NOTICE, "I have keys for slot %d, but the slot is " + "assigned to another node. " + "Setting it to importing state.",j); + server.cluster->importing_slots_from[j] = server.cluster->slots[j]; + } + } + if (update_config) clusterSaveConfigOrDie(1); + return C_OK; +} + +/* Remove all the shard channel related information not owned by the current shard. */ +static inline void removeAllNotOwnedShardChannelSubscriptions(void) { + if (!kvstoreSize(server.pubsubshard_channels)) return; + clusterNode *currmaster = clusterNodeIsMaster(myself) ? myself : myself->slaveof; + for (int j = 0; j < CLUSTER_SLOTS; j++) { + if (server.cluster->slots[j] != currmaster) { + removeChannelsInSlot(j); + } + } +} + +/* ----------------------------------------------------------------------------- + * SLAVE nodes handling + * -------------------------------------------------------------------------- */ + +/* Set the specified node 'n' as master for this node. + * If this node is currently a master, it is turned into a slave.
*/ +void clusterSetMaster(clusterNode *n) { + serverAssert(n != myself); + serverAssert(myself->numslots == 0); + + if (clusterNodeIsMaster(myself)) { + myself->flags &= ~(CLUSTER_NODE_MASTER|CLUSTER_NODE_MIGRATE_TO); + myself->flags |= CLUSTER_NODE_SLAVE; + clusterCloseAllSlots(); + } else { + if (myself->slaveof) + clusterNodeRemoveSlave(myself->slaveof,myself); + } + myself->slaveof = n; + updateShardId(myself, n->shard_id); + clusterNodeAddSlave(n,myself); + replicationSetMaster(n->ip, getNodeDefaultReplicationPort(n)); + removeAllNotOwnedShardChannelSubscriptions(); + resetManualFailover(); +} + +/* ----------------------------------------------------------------------------- + * Nodes to string representation functions. + * -------------------------------------------------------------------------- */ + +struct redisNodeFlags { + uint16_t flag; + char *name; +}; + +static struct redisNodeFlags redisNodeFlagsTable[] = { + {CLUSTER_NODE_MYSELF, "myself,"}, + {CLUSTER_NODE_MASTER, "master,"}, + {CLUSTER_NODE_SLAVE, "slave,"}, + {CLUSTER_NODE_PFAIL, "fail?,"}, + {CLUSTER_NODE_FAIL, "fail,"}, + {CLUSTER_NODE_HANDSHAKE, "handshake,"}, + {CLUSTER_NODE_NOADDR, "noaddr,"}, + {CLUSTER_NODE_NOFAILOVER, "nofailover,"} +}; + +/* Concatenate the comma separated list of node flags to the given SDS + * string 'ci'. */ +sds representClusterNodeFlags(sds ci, uint16_t flags) { + size_t orig_len = sdslen(ci); + int i, size = sizeof(redisNodeFlagsTable)/sizeof(struct redisNodeFlags); + for (i = 0; i < size; i++) { + struct redisNodeFlags *nodeflag = redisNodeFlagsTable + i; + if (flags & nodeflag->flag) ci = sdscat(ci, nodeflag->name); + } + /* If no flag was added, add the "noflags" special flag. */ + if (sdslen(ci) == orig_len) ci = sdscat(ci,"noflags,"); + sdsIncrLen(ci,-1); /* Remove trailing comma. */ + return ci; +} + +/* Concatenate the slot ownership information to the given SDS string 'ci'.
 * NOTE(review): representSlotInfo() below passes 'unsigned long' values to
 * sdscatfmt "%i" (which reads an int) — presumably safe since slot numbers fit
 * in 0..16383, but verify against the sds format-specifier contract.
 * If the slot ownership is in a contiguous block, it's represented as start-end pair, + * else each slot is added separately. */ +sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count) { + for (int i = 0; i< slot_info_pairs_count; i+=2) { + unsigned long start = slot_info_pairs[i]; + unsigned long end = slot_info_pairs[i+1]; + if (start == end) { + ci = sdscatfmt(ci, " %i", start); + } else { + ci = sdscatfmt(ci, " %i-%i", start, end); + } + } + return ci; +} + +/* Generate a csv-alike representation of the specified cluster node. + * See clusterGenNodesDescription() top comment for more information. + * + * The function returns the string representation as an SDS string. */ +sds clusterGenNodeDescription(client *c, clusterNode *node, int tls_primary) { + int j, start; + sds ci; + int port = clusterNodeClientPort(node, tls_primary); + + /* Node coordinates */ + ci = sdscatlen(sdsempty(),node->name,CLUSTER_NAMELEN); + ci = sdscatfmt(ci," %s:%i@%i", + node->ip, + port, + node->cport); + if (sdslen(node->hostname) != 0) { + ci = sdscatfmt(ci,",%s", node->hostname); + } + /* Don't expose aux fields to any clients yet but do allow them + * to be persisted to nodes.conf */ + if (c == NULL) { + if (sdslen(node->hostname) == 0) { + ci = sdscatfmt(ci,",", 1); + } + for (int i = af_count-1; i >=0; i--) { + if ((tls_primary && i == af_tls_port) || (!tls_primary && i == af_tcp_port)) { + continue; + } + if (auxFieldHandlers[i].isPresent(node)) { + ci = sdscatprintf(ci, ",%s=", auxFieldHandlers[i].field); + ci = auxFieldHandlers[i].getter(node, ci); + } + } + } + + /* Flags */ + ci = sdscatlen(ci," ",1); + ci = representClusterNodeFlags(ci, node->flags); + + /* Slave of...
or just "-" */ + ci = sdscatlen(ci," ",1); + if (node->slaveof) + ci = sdscatlen(ci,node->slaveof->name,CLUSTER_NAMELEN); + else + ci = sdscatlen(ci,"-",1); + + unsigned long long nodeEpoch = node->configEpoch; + if (nodeIsSlave(node) && node->slaveof) { + nodeEpoch = node->slaveof->configEpoch; + } + /* Latency from the POV of this node, config epoch, link status */ + ci = sdscatfmt(ci," %I %I %U %s", + (long long) node->ping_sent, + (long long) node->pong_received, + nodeEpoch, + (node->link || node->flags & CLUSTER_NODE_MYSELF) ? + "connected" : "disconnected"); + + /* Slots served by this instance. If we already have slots info, + * append it directly, otherwise, generate slots only if it has. */ + if (node->slot_info_pairs) { + ci = representSlotInfo(ci, node->slot_info_pairs, node->slot_info_pairs_count); + } else if (node->numslots > 0) { + start = -1; + for (j = 0; j < CLUSTER_SLOTS; j++) { + int bit; + + if ((bit = clusterNodeCoversSlot(node, j)) != 0) { + if (start == -1) start = j; + } + if (start != -1 && (!bit || j == CLUSTER_SLOTS-1)) { + if (bit && j == CLUSTER_SLOTS-1) j++; + + if (start == j-1) { + ci = sdscatfmt(ci," %i",start); + } else { + ci = sdscatfmt(ci," %i-%i",start,j-1); + } + start = -1; + } + } + } + + /* Just for MYSELF node we also dump info about slots that + * we are migrating to other instances or importing from other + * instances. */ + if (node->flags & CLUSTER_NODE_MYSELF) { + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (server.cluster->migrating_slots_to[j]) { + ci = sdscatprintf(ci," [%d->-%.40s]",j, + server.cluster->migrating_slots_to[j]->name); + } else if (server.cluster->importing_slots_from[j]) { + ci = sdscatprintf(ci," [%d-<-%.40s]",j, + server.cluster->importing_slots_from[j]->name); + } + } + } + return ci; +} + +/* Generate the slot topology for all nodes and store the string representation + * in the slots_info struct on the node.
 * NOTE(review): clusterGenNodesSlotsInfo() below iterates to i == CLUSTER_SLOTS
 * on purpose, so the final open range is flushed; the serverAssert guards the
 * 2*numslots pair-buffer capacity.
This is used to improve the efficiency + * of clusterGenNodesDescription() because it removes looping of the slot space + * for generating the slot info for each node individually. */ +void clusterGenNodesSlotsInfo(int filter) { + clusterNode *n = NULL; + int start = -1; + + for (int i = 0; i <= CLUSTER_SLOTS; i++) { + /* Find start node and slot id. */ + if (n == NULL) { + if (i == CLUSTER_SLOTS) break; + n = server.cluster->slots[i]; + start = i; + continue; + } + + /* Generate slots info when occur different node with start + * or end of slot. */ + if (i == CLUSTER_SLOTS || n != server.cluster->slots[i]) { + if (!(n->flags & filter)) { + if (!n->slot_info_pairs) { + n->slot_info_pairs = zmalloc(2 * n->numslots * sizeof(uint16_t)); + } + serverAssert((n->slot_info_pairs_count + 1) < (2 * n->numslots)); + n->slot_info_pairs[n->slot_info_pairs_count++] = start; + n->slot_info_pairs[n->slot_info_pairs_count++] = i-1; + } + if (i == CLUSTER_SLOTS) break; + n = server.cluster->slots[i]; + start = i; + } + } +} + +void clusterFreeNodesSlotsInfo(clusterNode *n) { + zfree(n->slot_info_pairs); + n->slot_info_pairs = NULL; + n->slot_info_pairs_count = 0; +} + +/* Generate a csv-alike representation of the nodes we are aware of, + * including the "myself" node, and return an SDS string containing the + * representation (it is up to the caller to free it). + * + * All the nodes matching at least one of the node flags specified in + * "filter" are excluded from the output, so using zero as a filter will + * include all the known nodes in the representation, including nodes in + * the HANDSHAKE state. + * + * Setting tls_primary to 1 to put TLS port in the main : + * field and put TCP port in aux field, instead of the opposite way. + * + * The representation obtained using this function is used for the output + * of the CLUSTER NODES function, and as format for the cluster + * configuration file (nodes.conf) for a given node.
*/ +sds clusterGenNodesDescription(client *c, int filter, int tls_primary) { + sds ci = sdsempty(), ni; + dictIterator *di; + dictEntry *de; + + /* Generate all nodes slots info firstly. */ + clusterGenNodesSlotsInfo(filter); + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (node->flags & filter) continue; + ni = clusterGenNodeDescription(c, node, tls_primary); + ci = sdscatsds(ci,ni); + sdsfree(ni); + ci = sdscatlen(ci,"\n",1); + + /* Release slots info. */ + clusterFreeNodesSlotsInfo(node); + } + dictReleaseIterator(di); + return ci; +} + +/* Add to the output buffer of the given client the description of the given cluster link. + * The description is a map with each entry being an attribute of the link. */ +void addReplyClusterLinkDescription(client *c, clusterLink *link) { + addReplyMapLen(c, 6); + + addReplyBulkCString(c, "direction"); + addReplyBulkCString(c, link->inbound ? "from" : "to"); + + /* addReplyClusterLinkDescription is only called for links that have been + * associated with nodes. The association is always bi-directional, so + * in addReplyClusterLinkDescription, link->node should never be NULL. 
*/ + serverAssert(link->node); + sds node_name = sdsnewlen(link->node->name, CLUSTER_NAMELEN); + addReplyBulkCString(c, "node"); + addReplyBulkCString(c, node_name); + sdsfree(node_name); + + addReplyBulkCString(c, "create-time"); + addReplyLongLong(c, link->ctime); + + char events[3], *p; + p = events; + if (link->conn) { + if (connHasReadHandler(link->conn)) *p++ = 'r'; + if (connHasWriteHandler(link->conn)) *p++ = 'w'; + } + *p = '\0'; + addReplyBulkCString(c, "events"); + addReplyBulkCString(c, events); + + addReplyBulkCString(c, "send-buffer-allocated"); + addReplyLongLong(c, link->send_msg_queue_mem); + + addReplyBulkCString(c, "send-buffer-used"); + addReplyLongLong(c, link->send_msg_queue_mem); +} + +/* Add to the output buffer of the given client an array of cluster link descriptions, + * with array entry being a description of a single current cluster link. */ +void addReplyClusterLinksDescription(client *c) { + dictIterator *di; + dictEntry *de; + void *arraylen_ptr = NULL; + int num_links = 0; + + arraylen_ptr = addReplyDeferredLen(c); + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + if (node->link) { + num_links++; + addReplyClusterLinkDescription(c, node->link); + } + if (node->inbound_link) { + num_links++; + addReplyClusterLinkDescription(c, node->inbound_link); + } + } + dictReleaseIterator(di); + + setDeferredArrayLen(c, arraylen_ptr, num_links); +} + +/* ----------------------------------------------------------------------------- + * CLUSTER command + * -------------------------------------------------------------------------- */ + +const char *clusterGetMessageTypeString(int type) { + switch(type) { + case CLUSTERMSG_TYPE_PING: return "ping"; + case CLUSTERMSG_TYPE_PONG: return "pong"; + case CLUSTERMSG_TYPE_MEET: return "meet"; + case CLUSTERMSG_TYPE_FAIL: return "fail"; + case CLUSTERMSG_TYPE_PUBLISH: return "publish"; + case CLUSTERMSG_TYPE_PUBLISHSHARD: 
return "publishshard"; + case CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST: return "auth-req"; + case CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK: return "auth-ack"; + case CLUSTERMSG_TYPE_UPDATE: return "update"; + case CLUSTERMSG_TYPE_MFSTART: return "mfstart"; + case CLUSTERMSG_TYPE_MODULE: return "module"; + } + return "unknown"; +} + +int getSlotOrReply(client *c, robj *o) { + long long slot; + + if (getLongLongFromObject(o,&slot) != C_OK || + slot < 0 || slot >= CLUSTER_SLOTS) + { + addReplyError(c,"Invalid or out of range slot"); + return -1; + } + return (int) slot; +} + +int checkSlotAssignmentsOrReply(client *c, unsigned char *slots, int del, int start_slot, int end_slot) { + int slot; + for (slot = start_slot; slot <= end_slot; slot++) { + if (del && server.cluster->slots[slot] == NULL) { + addReplyErrorFormat(c,"Slot %d is already unassigned", slot); + return C_ERR; + } else if (!del && server.cluster->slots[slot]) { + addReplyErrorFormat(c,"Slot %d is already busy", slot); + return C_ERR; + } + if (slots[slot]++ == 1) { + addReplyErrorFormat(c,"Slot %d specified multiple times",(int)slot); + return C_ERR; + } + } + return C_OK; +} + +void clusterUpdateSlots(client *c, unsigned char *slots, int del) { + int j; + for (j = 0; j < CLUSTER_SLOTS; j++) { + if (slots[j]) { + int retval; + + /* If this slot was set as importing we can clear this + * state as now we are the real owner of the slot. */ + if (server.cluster->importing_slots_from[j]) + server.cluster->importing_slots_from[j] = NULL; + + retval = del ? clusterDelSlot(j) : + clusterAddSlot(myself,j); + serverAssertWithInfo(c,NULL,retval == C_OK); + } + } +} + +/* Add detailed information of a node to the output buffer of the given client. 
*/ +void addNodeDetailsToShardReply(client *c, clusterNode *node) { + int reply_count = 0; + void *node_replylen = addReplyDeferredLen(c); + addReplyBulkCString(c, "id"); + addReplyBulkCBuffer(c, node->name, CLUSTER_NAMELEN); + reply_count++; + + if (node->tcp_port) { + addReplyBulkCString(c, "port"); + addReplyLongLong(c, node->tcp_port); + reply_count++; + } + + if (node->tls_port) { + addReplyBulkCString(c, "tls-port"); + addReplyLongLong(c, node->tls_port); + reply_count++; + } + + addReplyBulkCString(c, "ip"); + addReplyBulkCString(c, node->ip); + reply_count++; + + addReplyBulkCString(c, "endpoint"); + addReplyBulkCString(c, clusterNodePreferredEndpoint(node)); + reply_count++; + + if (sdslen(node->hostname) != 0) { + addReplyBulkCString(c, "hostname"); + addReplyBulkCBuffer(c, node->hostname, sdslen(node->hostname)); + reply_count++; + } + + long long node_offset; + if (node->flags & CLUSTER_NODE_MYSELF) { + node_offset = nodeIsSlave(node) ? replicationGetSlaveOffset() : server.master_repl_offset; + } else { + node_offset = node->repl_offset; + } + + addReplyBulkCString(c, "role"); + addReplyBulkCString(c, nodeIsSlave(node) ? "replica" : "master"); + reply_count++; + + addReplyBulkCString(c, "replication-offset"); + addReplyLongLong(c, node_offset); + reply_count++; + + addReplyBulkCString(c, "health"); + const char *health_msg = NULL; + if (nodeFailed(node)) { + health_msg = "fail"; + } else if (nodeIsSlave(node) && node_offset == 0) { + health_msg = "loading"; + } else { + health_msg = "online"; + } + addReplyBulkCString(c, health_msg); + reply_count++; + + setDeferredMapLen(c, node_replylen, reply_count); +} + +/* Add the shard reply of a single shard based off the given primary node. 
*/ +void addShardReplyForClusterShards(client *c, list *nodes) { + serverAssert(listLength(nodes) > 0); + clusterNode *n = listNodeValue(listFirst(nodes)); + addReplyMapLen(c, 2); + addReplyBulkCString(c, "slots"); + + /* Use slot_info_pairs from the primary only */ + n = clusterNodeGetMaster(n); + + if (n->slot_info_pairs != NULL) { + serverAssert((n->slot_info_pairs_count % 2) == 0); + addReplyArrayLen(c, n->slot_info_pairs_count); + for (int i = 0; i < n->slot_info_pairs_count; i++) + addReplyLongLong(c, (unsigned long)n->slot_info_pairs[i]); + } else { + /* If no slot info pair is provided, the node owns no slots */ + addReplyArrayLen(c, 0); + } + + addReplyBulkCString(c, "nodes"); + addReplyArrayLen(c, listLength(nodes)); + listIter li; + listRewind(nodes, &li); + for (listNode *ln = listNext(&li); ln != NULL; ln = listNext(&li)) { + clusterNode *n = listNodeValue(ln); + addNodeDetailsToShardReply(c, n); + clusterFreeNodesSlotsInfo(n); + } +} + +/* Add to the output buffer of the given client, an array of slot (start, end) + * pair owned by the shard, also the primary and set of replica(s) along with + * information about each node. 
 */
void clusterCommandShards(client *c) {
    /* CLUSTER SHARDS: one reply element per shard. */
    addReplyArrayLen(c, dictSize(server.cluster->shards));
    /* This call will add slot_info_pairs to all nodes */
    clusterGenNodesSlotsInfo(0);
    dictIterator *di = dictGetSafeIterator(server.cluster->shards);
    for(dictEntry *de = dictNext(di); de != NULL; de = dictNext(di)) {
        addShardReplyForClusterShards(c, dictGetVal(de));
    }
    dictReleaseIterator(di);
}

/* Build and return (as a new sds string, owned by the caller) the body of the
 * CLUSTER INFO reply: state, slot health counters, epochs and per-message-type
 * cluster bus statistics. */
sds genClusterInfoString(void) {
    sds info = sdsempty();
    char *statestr[] = {"ok","fail"};
    int slots_assigned = 0, slots_ok = 0, slots_pfail = 0, slots_fail = 0;
    uint64_t myepoch;
    int j;

    /* Classify every assigned slot by the health of its owner. */
    for (j = 0; j < CLUSTER_SLOTS; j++) {
        clusterNode *n = server.cluster->slots[j];

        if (n == NULL) continue;
        slots_assigned++;
        if (nodeFailed(n)) {
            slots_fail++;
        } else if (nodeTimedOut(n)) {
            slots_pfail++;
        } else {
            slots_ok++;
        }
    }

    /* A replica reports its master's config epoch. */
    myepoch = (nodeIsSlave(myself) && myself->slaveof) ?
              myself->slaveof->configEpoch : myself->configEpoch;

    info = sdscatprintf(info,
        "cluster_state:%s\r\n"
        "cluster_slots_assigned:%d\r\n"
        "cluster_slots_ok:%d\r\n"
        "cluster_slots_pfail:%d\r\n"
        "cluster_slots_fail:%d\r\n"
        "cluster_known_nodes:%lu\r\n"
        "cluster_size:%d\r\n"
        "cluster_current_epoch:%llu\r\n"
        "cluster_my_epoch:%llu\r\n"
        , statestr[server.cluster->state],
        slots_assigned,
        slots_ok,
        slots_pfail,
        slots_fail,
        dictSize(server.cluster->nodes),
        server.cluster->size,
        (unsigned long long) server.cluster->currentEpoch,
        (unsigned long long) myepoch
    );

    /* Show stats about messages sent and received. */
    long long tot_msg_sent = 0;
    long long tot_msg_received = 0;

    /* Only message types with a nonzero counter get their own line. */
    for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) {
        if (server.cluster->stats_bus_messages_sent[i] == 0) continue;
        tot_msg_sent += server.cluster->stats_bus_messages_sent[i];
        info = sdscatprintf(info,
            "cluster_stats_messages_%s_sent:%lld\r\n",
            clusterGetMessageTypeString(i),
            server.cluster->stats_bus_messages_sent[i]);
    }
    info = sdscatprintf(info,
        "cluster_stats_messages_sent:%lld\r\n", tot_msg_sent);

    for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) {
        if (server.cluster->stats_bus_messages_received[i] == 0) continue;
        tot_msg_received += server.cluster->stats_bus_messages_received[i];
        info = sdscatprintf(info,
            "cluster_stats_messages_%s_received:%lld\r\n",
            clusterGetMessageTypeString(i),
            server.cluster->stats_bus_messages_received[i]);
    }
    info = sdscatprintf(info,
        "cluster_stats_messages_received:%lld\r\n", tot_msg_received);

    info = sdscatprintf(info,
        "total_cluster_links_buffer_limit_exceeded:%llu\r\n",
        server.cluster->stat_cluster_links_buffer_limit_exceeded);

    return info;
}


/* Unsubscribe all shard channels that hash to 'slot'; no-op when the slot
 * has no channels. */
void removeChannelsInSlot(unsigned int slot) {
    if (countChannelsInSlot(slot) == 0) return;

    pubsubShardUnsubscribeAllChannelsInSlot(slot);
}

/* Remove all the keys in the specified hash slot.
 * The number of removed items is returned.
 */
unsigned int delKeysInSlot(unsigned int hashslot) {
    /* Fast path: nothing stored under this slot. */
    if (!kvstoreDictSize(server.db->keys, hashslot))
        return 0;

    unsigned int j = 0;

    kvstoreDictIterator *kvs_di = NULL;
    dictEntry *de = NULL;
    /* Safe iterator: entries are deleted while iterating. */
    kvs_di = kvstoreGetDictSafeIterator(server.db->keys, hashslot);
    while((de = kvstoreDictIteratorNext(kvs_di)) != NULL) {
        enterExecutionUnit(1, 0);
        sds sdskey = dictGetKey(de);
        /* Copy the key into an robj: dbDelete frees the stored key, so the
         * original sds must not be used after the deletion. */
        robj *key = createStringObject(sdskey, sdslen(sdskey));
        dbDelete(&server.db[0], key);
        propagateDeletion(&server.db[0], key, server.lazyfree_lazy_server_del);
        signalModifiedKey(NULL, &server.db[0], key);
        /* The keys are not actually logically deleted from the database, just moved to another node.
         * The modules needs to know that these keys are no longer available locally, so just send the
         * keyspace notification to the modules, but not to clients. */
        moduleNotifyKeyspaceEvent(NOTIFY_GENERIC, "del", key, server.db[0].id);
        exitExecutionUnit();
        postExecutionUnitOperations();
        decrRefCount(key);
        j++;
        server.dirty++;
    }
    kvstoreReleaseDictIterator(kvs_di);

    return j;
}

/* Get the count of the channels for a given slot.
*/ +unsigned int countChannelsInSlot(unsigned int hashslot) { + return kvstoreDictSize(server.pubsubshard_channels, hashslot); +} + +int clusterNodeIsMyself(clusterNode *n) { + return n == server.cluster->myself; +} + +clusterNode *getMyClusterNode(void) { + return server.cluster->myself; +} + +int clusterManualFailoverTimeLimit(void) { + return server.cluster->mf_end; +} + +int getClusterSize(void) { + return dictSize(server.cluster->nodes); +} + +int getMyShardSlotCount(void) { + if (!nodeIsSlave(server.cluster->myself)) { + return server.cluster->myself->numslots; + } else if (server.cluster->myself->slaveof) { + return server.cluster->myself->slaveof->numslots; + } else { + return 0; + } +} + +char **getClusterNodesList(size_t *numnodes) { + size_t count = dictSize(server.cluster->nodes); + char **ids = zmalloc((count+1)*CLUSTER_NAMELEN); + dictIterator *di = dictGetIterator(server.cluster->nodes); + dictEntry *de; + int j = 0; + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + if (node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) continue; + ids[j] = zmalloc(CLUSTER_NAMELEN); + memcpy(ids[j],node->name,CLUSTER_NAMELEN); + j++; + } + *numnodes = j; + ids[j] = NULL; /* Null term so that FreeClusterNodesList does not need + * to also get the count argument. */ + dictReleaseIterator(di); + return ids; +} + +int clusterNodeIsMaster(clusterNode *n) { + return n->flags & CLUSTER_NODE_MASTER; +} + +int handleDebugClusterCommand(client *c) { + if (strcasecmp(c->argv[1]->ptr, "CLUSTERLINK") || + strcasecmp(c->argv[2]->ptr, "KILL") || + c->argc != 5) { + return 0; + } + + if (!server.cluster_enabled) { + addReplyError(c, "Debug option only available for cluster mode enabled setup!"); + return 1; + } + + /* Find the node. 
*/ + clusterNode *n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (!n) { + addReplyErrorFormat(c, "Unknown node %s", (char *) c->argv[4]->ptr); + return 1; + } + + /* Terminate the link based on the direction or all. */ + if (!strcasecmp(c->argv[3]->ptr, "from")) { + if (n->inbound_link) freeClusterLink(n->inbound_link); + } else if (!strcasecmp(c->argv[3]->ptr, "to")) { + if (n->link) freeClusterLink(n->link); + } else if (!strcasecmp(c->argv[3]->ptr, "all")) { + if (n->link) freeClusterLink(n->link); + if (n->inbound_link) freeClusterLink(n->inbound_link); + } else { + addReplyErrorFormat(c, "Unknown direction %s", (char *) c->argv[3]->ptr); + } + addReply(c, shared.ok); + + return 1; +} + +int clusterNodePending(clusterNode *node) { + return node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE); +} + +char *clusterNodeIp(clusterNode *node) { + return node->ip; +} + +int clusterNodeIsSlave(clusterNode *node) { + return node->flags & CLUSTER_NODE_SLAVE; +} + +clusterNode *clusterNodeGetSlaveof(clusterNode *node) { + return node->slaveof; +} + +clusterNode *clusterNodeGetMaster(clusterNode *node) { + while (node->slaveof != NULL) node = node->slaveof; + return node; +} + +char *clusterNodeGetName(clusterNode *node) { + return node->name; +} + +int clusterNodeTimedOut(clusterNode *node) { + return nodeTimedOut(node); +} + +int clusterNodeIsFailing(clusterNode *node) { + return nodeFailed(node); +} + +int clusterNodeIsNoFailover(clusterNode *node) { + return node->flags & CLUSTER_NODE_NOFAILOVER; +} + +const char **clusterDebugCommandExtendedHelp(void) { + static const char *help[] = { + "CLUSTERLINK KILL ", + " Kills the link based on the direction to/from (both) with the provided node.", + NULL + }; + + return help; +} + +char *clusterNodeGetShardId(clusterNode *node) { + return node->shard_id; +} + +int clusterCommandSpecial(client *c) { + if (!strcasecmp(c->argv[1]->ptr,"meet") && (c->argc == 4 || c->argc == 5)) { + /* CLUSTER MEET 
[cport] */ + long long port, cport; + + if (getLongLongFromObject(c->argv[3], &port) != C_OK) { + addReplyErrorFormat(c,"Invalid base port specified: %s", + (char*)c->argv[3]->ptr); + return 1; + } + + if (c->argc == 5) { + if (getLongLongFromObject(c->argv[4], &cport) != C_OK) { + addReplyErrorFormat(c,"Invalid bus port specified: %s", + (char*)c->argv[4]->ptr); + return 1; + } + } else { + cport = port + CLUSTER_PORT_INCR; + } + + if (clusterStartHandshake(c->argv[2]->ptr,port,cport) == 0 && + errno == EINVAL) + { + addReplyErrorFormat(c,"Invalid node address specified: %s:%s", + (char*)c->argv[2]->ptr, (char*)c->argv[3]->ptr); + } else { + addReply(c,shared.ok); + } + } else if (!strcasecmp(c->argv[1]->ptr,"flushslots") && c->argc == 2) { + /* CLUSTER FLUSHSLOTS */ + if (kvstoreSize(server.db[0].keys) != 0) { + addReplyError(c,"DB must be empty to perform CLUSTER FLUSHSLOTS."); + return 1; + } + clusterDelNodeSlots(myself); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if ((!strcasecmp(c->argv[1]->ptr,"addslots") || + !strcasecmp(c->argv[1]->ptr,"delslots")) && c->argc >= 3) { + /* CLUSTER ADDSLOTS [slot] ... */ + /* CLUSTER DELSLOTS [slot] ... */ + int j, slot; + unsigned char *slots = zmalloc(CLUSTER_SLOTS); + int del = !strcasecmp(c->argv[1]->ptr,"delslots"); + + memset(slots,0,CLUSTER_SLOTS); + /* Check that all the arguments are parseable.*/ + for (j = 2; j < c->argc; j++) { + if ((slot = getSlotOrReply(c,c->argv[j])) == C_ERR) { + zfree(slots); + return 1; + } + } + /* Check that the slots are not already busy. 
*/ + for (j = 2; j < c->argc; j++) { + slot = getSlotOrReply(c,c->argv[j]); + if (checkSlotAssignmentsOrReply(c, slots, del, slot, slot) == C_ERR) { + zfree(slots); + return 1; + } + } + clusterUpdateSlots(c, slots, del); + zfree(slots); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if ((!strcasecmp(c->argv[1]->ptr,"addslotsrange") || + !strcasecmp(c->argv[1]->ptr,"delslotsrange")) && c->argc >= 4) { + if (c->argc % 2 == 1) { + addReplyErrorArity(c); + return 1; + } + /* CLUSTER ADDSLOTSRANGE [ ...] */ + /* CLUSTER DELSLOTSRANGE [ ...] */ + int j, startslot, endslot; + unsigned char *slots = zmalloc(CLUSTER_SLOTS); + int del = !strcasecmp(c->argv[1]->ptr,"delslotsrange"); + + memset(slots,0,CLUSTER_SLOTS); + /* Check that all the arguments are parseable and that all the + * slots are not already busy. */ + for (j = 2; j < c->argc; j += 2) { + if ((startslot = getSlotOrReply(c,c->argv[j])) == C_ERR) { + zfree(slots); + return 1; + } + if ((endslot = getSlotOrReply(c,c->argv[j+1])) == C_ERR) { + zfree(slots); + return 1; + } + if (startslot > endslot) { + addReplyErrorFormat(c,"start slot number %d is greater than end slot number %d", startslot, endslot); + zfree(slots); + return 1; + } + + if (checkSlotAssignmentsOrReply(c, slots, del, startslot, endslot) == C_ERR) { + zfree(slots); + return 1; + } + } + clusterUpdateSlots(c, slots, del); + zfree(slots); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"setslot") && c->argc >= 4) { + /* SETSLOT 10 MIGRATING */ + /* SETSLOT 10 IMPORTING */ + /* SETSLOT 10 STABLE */ + /* SETSLOT 10 NODE */ + int slot; + clusterNode *n; + + if (nodeIsSlave(myself)) { + addReplyError(c,"Please use SETSLOT only with masters."); + return 1; + } + + if ((slot = getSlotOrReply(c, c->argv[2])) == -1) return 1; + + if (!strcasecmp(c->argv[3]->ptr,"migrating") && c->argc == 5) { + if 
(server.cluster->slots[slot] != myself) { + addReplyErrorFormat(c,"I'm not the owner of hash slot %u",slot); + return 1; + } + n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (n == NULL) { + addReplyErrorFormat(c,"I don't know about node %s", + (char*)c->argv[4]->ptr); + return 1; + } + if (nodeIsSlave(n)) { + addReplyError(c,"Target node is not a master"); + return 1; + } + server.cluster->migrating_slots_to[slot] = n; + } else if (!strcasecmp(c->argv[3]->ptr,"importing") && c->argc == 5) { + if (server.cluster->slots[slot] == myself) { + addReplyErrorFormat(c, + "I'm already the owner of hash slot %u",slot); + return 1; + } + n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (n == NULL) { + addReplyErrorFormat(c,"I don't know about node %s", + (char*)c->argv[4]->ptr); + return 1; + } + if (nodeIsSlave(n)) { + addReplyError(c,"Target node is not a master"); + return 1; + } + server.cluster->importing_slots_from[slot] = n; + } else if (!strcasecmp(c->argv[3]->ptr,"stable") && c->argc == 4) { + /* CLUSTER SETSLOT STABLE */ + server.cluster->importing_slots_from[slot] = NULL; + server.cluster->migrating_slots_to[slot] = NULL; + } else if (!strcasecmp(c->argv[3]->ptr,"node") && c->argc == 5) { + /* CLUSTER SETSLOT NODE */ + n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", + (char*)c->argv[4]->ptr); + return 1; + } + if (nodeIsSlave(n)) { + addReplyError(c,"Target node is not a master"); + return 1; + } + /* If this hash slot was served by 'myself' before to switch + * make sure there are no longer local keys for this hash slot. 
*/ + if (server.cluster->slots[slot] == myself && n != myself) { + if (countKeysInSlot(slot) != 0) { + addReplyErrorFormat(c, + "Can't assign hashslot %d to a different node " + "while I still hold keys for this hash slot.", slot); + return 1; + } + } + /* If this slot is in migrating status but we have no keys + * for it assigning the slot to another node will clear + * the migrating status. */ + if (countKeysInSlot(slot) == 0 && + server.cluster->migrating_slots_to[slot]) + server.cluster->migrating_slots_to[slot] = NULL; + + int slot_was_mine = server.cluster->slots[slot] == myself; + clusterDelSlot(slot); + clusterAddSlot(n,slot); + + /* If we are a master left without slots, we should turn into a + * replica of the new master. */ + if (slot_was_mine && + n != myself && + myself->numslots == 0 && + server.cluster_allow_replica_migration) { + serverLog(LL_NOTICE, + "Configuration change detected. Reconfiguring myself " + "as a replica of %.40s (%s)", n->name, n->human_nodename); + clusterSetMaster(n); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | + CLUSTER_TODO_UPDATE_STATE | + CLUSTER_TODO_FSYNC_CONFIG); + } + + /* If this node was importing this slot, assigning the slot to + * itself also clears the importing status. */ + if (n == myself && + server.cluster->importing_slots_from[slot]) { + /* This slot was manually migrated, set this node configEpoch + * to a new epoch so that the new version can be propagated + * by the cluster. + * + * Note that if this ever results in a collision with another + * node getting the same configEpoch, for example because a + * failover happens at the same time we close the slot, the + * configEpoch collision resolution will fix it assigning + * a different epoch to each node. 
*/ + if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { + serverLog(LL_NOTICE, + "configEpoch updated after importing slot %d", slot); + } + server.cluster->importing_slots_from[slot] = NULL; + /* After importing this slot, let the other nodes know as + * soon as possible. */ + clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + } + } else { + addReplyError(c, + "Invalid CLUSTER SETSLOT action or number of arguments. Try CLUSTER HELP"); + return 1; + } + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"bumpepoch") && c->argc == 2) { + /* CLUSTER BUMPEPOCH */ + int retval = clusterBumpConfigEpochWithoutConsensus(); + sds reply = sdscatprintf(sdsempty(),"+%s %llu\r\n", + (retval == C_OK) ? "BUMPED" : "STILL", + (unsigned long long) myself->configEpoch); + addReplySds(c,reply); + } else if (!strcasecmp(c->argv[1]->ptr,"saveconfig") && c->argc == 2) { + int retval = clusterSaveConfig(1); + + if (retval == 0) + addReply(c,shared.ok); + else + addReplyErrorFormat(c,"error saving the cluster node config: %s", + strerror(errno)); + } else if (!strcasecmp(c->argv[1]->ptr,"forget") && c->argc == 3) { + /* CLUSTER FORGET */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + if (!n) { + if (clusterBlacklistExists((char*)c->argv[2]->ptr)) + /* Already forgotten. The deletion may have been gossipped by + * another node, so we pretend it succeeded. 
*/ + addReply(c,shared.ok); + else + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return 1; + } else if (n == myself) { + addReplyError(c,"I tried hard but I can't forget myself..."); + return 1; + } else if (nodeIsSlave(myself) && myself->slaveof == n) { + addReplyError(c,"Can't forget my master!"); + return 1; + } + clusterBlacklistAddNode(n); + clusterDelNode(n); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc == 3) { + /* CLUSTER REPLICATE */ + /* Lookup the specified node in our table. */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return 1; + } + + /* I can't replicate myself. */ + if (n == myself) { + addReplyError(c,"Can't replicate myself"); + return 1; + } + + /* Can't replicate a slave. */ + if (nodeIsSlave(n)) { + addReplyError(c,"I can only replicate a master, not a replica."); + return 1; + } + + /* If the instance is currently a master, it should have no assigned + * slots nor keys to accept to replicate some other node. + * Slaves can switch to another master without issues. */ + if (clusterNodeIsMaster(myself) && + (myself->numslots != 0 || kvstoreSize(server.db[0].keys) != 0)) { + addReplyError(c, + "To set a master the node must be empty and " + "without assigned slots."); + return 1; + } + + /* Set the master. 
*/ + clusterSetMaster(n); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"count-failure-reports") && + c->argc == 3) + { + /* CLUSTER COUNT-FAILURE-REPORTS */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return 1; + } else { + addReplyLongLong(c,clusterNodeFailureReportsCount(n)); + } + } else if (!strcasecmp(c->argv[1]->ptr,"failover") && + (c->argc == 2 || c->argc == 3)) + { + /* CLUSTER FAILOVER [FORCE|TAKEOVER] */ + int force = 0, takeover = 0; + + if (c->argc == 3) { + if (!strcasecmp(c->argv[2]->ptr,"force")) { + force = 1; + } else if (!strcasecmp(c->argv[2]->ptr,"takeover")) { + takeover = 1; + force = 1; /* Takeover also implies force. */ + } else { + addReplyErrorObject(c,shared.syntaxerr); + return 1; + } + } + + /* Check preconditions. */ + if (clusterNodeIsMaster(myself)) { + addReplyError(c,"You should send CLUSTER FAILOVER to a replica"); + return 1; + } else if (myself->slaveof == NULL) { + addReplyError(c,"I'm a replica but my master is unknown to me"); + return 1; + } else if (!force && + (nodeFailed(myself->slaveof) || + myself->slaveof->link == NULL)) + { + addReplyError(c,"Master is down or failed, " + "please use CLUSTER FAILOVER FORCE"); + return 1; + } + resetManualFailover(); + server.cluster->mf_end = mstime() + CLUSTER_MF_TIMEOUT; + + if (takeover) { + /* A takeover does not perform any initial check. It just + * generates a new configuration epoch for this node without + * consensus, claims the master's slots, and broadcast the new + * configuration. */ + serverLog(LL_NOTICE,"Taking over the master (user request)."); + clusterBumpConfigEpochWithoutConsensus(); + clusterFailoverReplaceYourMaster(); + } else if (force) { + /* If this is a forced failover, we don't need to talk with our + * master to agree about the offset. 
We just failover taking over + * it without coordination. */ + serverLog(LL_NOTICE,"Forced failover user request accepted."); + server.cluster->mf_can_start = 1; + } else { + serverLog(LL_NOTICE,"Manual failover user request accepted."); + clusterSendMFStart(myself->slaveof); + } + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"set-config-epoch") && c->argc == 3) + { + /* CLUSTER SET-CONFIG-EPOCH + * + * The user is allowed to set the config epoch only when a node is + * totally fresh: no config epoch, no other known node, and so forth. + * This happens at cluster creation time to start with a cluster where + * every node has a different node ID, without to rely on the conflicts + * resolution system which is too slow when a big cluster is created. */ + long long epoch; + + if (getLongLongFromObjectOrReply(c,c->argv[2],&epoch,NULL) != C_OK) + return 1; + + if (epoch < 0) { + addReplyErrorFormat(c,"Invalid config epoch specified: %lld",epoch); + } else if (dictSize(server.cluster->nodes) > 1) { + addReplyError(c,"The user can assign a config epoch only when the " + "node does not know any other node."); + } else if (myself->configEpoch != 0) { + addReplyError(c,"Node config epoch is already non-zero"); + } else { + myself->configEpoch = epoch; + serverLog(LL_NOTICE, + "configEpoch set to %llu via CLUSTER SET-CONFIG-EPOCH", + (unsigned long long) myself->configEpoch); + + if (server.cluster->currentEpoch < (uint64_t)epoch) + server.cluster->currentEpoch = epoch; + /* No need to fsync the config here since in the unlucky event + * of a failure to persist the config, the conflict resolution code + * will assign a unique config to this node. */ + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } + } else if (!strcasecmp(c->argv[1]->ptr,"reset") && + (c->argc == 2 || c->argc == 3)) + { + /* CLUSTER RESET [SOFT|HARD] */ + int hard = 0; + + /* Parse soft/hard argument. Default is soft. 
*/ + if (c->argc == 3) { + if (!strcasecmp(c->argv[2]->ptr,"hard")) { + hard = 1; + } else if (!strcasecmp(c->argv[2]->ptr,"soft")) { + hard = 0; + } else { + addReplyErrorObject(c,shared.syntaxerr); + return 1; + } + } + + /* Slaves can be reset while containing data, but not master nodes + * that must be empty. */ + if (clusterNodeIsMaster(myself) && kvstoreSize(c->db->keys) != 0) { + addReplyError(c,"CLUSTER RESET can't be called with " + "master nodes containing keys"); + return 1; + } + clusterReset(hard); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"links") && c->argc == 2) { + /* CLUSTER LINKS */ + addReplyClusterLinksDescription(c); + } else { + return 0; + } + + return 1; +} + +const char **clusterCommandExtendedHelp(void) { + static const char *help[] = { + "ADDSLOTS [ ...]", + " Assign slots to current node.", + "ADDSLOTSRANGE [ ...]", + " Assign slots which are between and to current node.", + "BUMPEPOCH", + " Advance the cluster config epoch.", + "COUNT-FAILURE-REPORTS ", + " Return number of failure reports for .", + "DELSLOTS [ ...]", + " Delete slots information from current node.", + "DELSLOTSRANGE [ ...]", + " Delete slots information which are between and from current node.", + "FAILOVER [FORCE|TAKEOVER]", + " Promote current replica node to being a master.", + "FORGET ", + " Remove a node from the cluster.", + "FLUSHSLOTS", + " Delete current node own slots information.", + "MEET []", + " Connect nodes into a working cluster.", + "REPLICATE ", + " Configure current node as replica to .", + "RESET [HARD|SOFT]", + " Reset current node (default: soft).", + "SET-CONFIG-EPOCH ", + " Set config epoch of current node.", + "SETSLOT (IMPORTING |MIGRATING |STABLE|NODE )", + " Set slot state.", + "SAVECONFIG", + " Force saving cluster configuration on disk.", + "LINKS", + " Return information about all network links between this node and its peers.", + " Output format is an array where each array element is a map containing attributes 
of a link", + NULL + }; + + return help; +} + +int clusterNodeNumSlaves(clusterNode *node) { + return node->numslaves; +} + +clusterNode *clusterNodeGetSlave(clusterNode *node, int slave_idx) { + return node->slaves[slave_idx]; +} + +clusterNode *getMigratingSlotDest(int slot) { + return server.cluster->migrating_slots_to[slot]; +} + +clusterNode *getImportingSlotSource(int slot) { + return server.cluster->importing_slots_from[slot]; +} + +int isClusterHealthy(void) { + return server.cluster->state == CLUSTER_OK; +} + +clusterNode *getNodeBySlot(int slot) { + return server.cluster->slots[slot]; +} + +char *clusterNodeHostname(clusterNode *node) { + return node->hostname; +} + +long long clusterNodeReplOffset(clusterNode *node) { + return node->repl_offset; +} + +const char *clusterNodePreferredEndpoint(clusterNode *n) { + char *hostname = clusterNodeHostname(n); + switch (server.cluster_preferred_endpoint_type) { + case CLUSTER_ENDPOINT_TYPE_IP: + return clusterNodeIp(n); + case CLUSTER_ENDPOINT_TYPE_HOSTNAME: + return (hostname != NULL && hostname[0] != '\0') ? hostname : "?"; + case CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT: + return ""; + } + return "unknown"; +} + +int clusterAllowFailoverCmd(client *c) { + if (!server.cluster_enabled) { + return 1; + } + addReplyError(c,"FAILOVER not allowed in cluster mode. " + "Use CLUSTER FAILOVER command instead."); + return 0; +} + +void clusterPromoteSelfToMaster(void) { + replicationUnsetMaster(); +} diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h new file mode 100644 index 00000000000..a857184ab3e --- /dev/null +++ b/src/cluster_legacy.h @@ -0,0 +1,359 @@ +#ifndef CLUSTER_LEGACY_H +#define CLUSTER_LEGACY_H + +#define CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */ + +/* The following defines are amount of time, sometimes expressed as + * multiplicators of the node timeout value (when ending with MULT). */ +#define CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. 
*/ +#define CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */ +#define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */ +#define CLUSTER_MF_PAUSE_MULT 2 /* Master pause manual failover mult. */ +#define CLUSTER_SLAVE_MIGRATION_DELAY 5000 /* Delay for slave migration. */ + +/* Reasons why a slave is not able to failover. */ +#define CLUSTER_CANT_FAILOVER_NONE 0 +#define CLUSTER_CANT_FAILOVER_DATA_AGE 1 +#define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2 +#define CLUSTER_CANT_FAILOVER_EXPIRED 3 +#define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4 +#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD (10) /* seconds. */ + +/* clusterState todo_before_sleep flags. */ +#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0) +#define CLUSTER_TODO_UPDATE_STATE (1<<1) +#define CLUSTER_TODO_SAVE_CONFIG (1<<2) +#define CLUSTER_TODO_FSYNC_CONFIG (1<<3) +#define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1<<4) + +/* clusterLink encapsulates everything needed to talk with a remote node. */ +typedef struct clusterLink { + mstime_t ctime; /* Link creation time */ + connection *conn; /* Connection to remote node */ + list *send_msg_queue; /* List of messages to be sent */ + size_t head_msg_send_offset; /* Number of bytes already sent of message at head of queue */ + unsigned long long send_msg_queue_mem; /* Memory in bytes used by message queue */ + char *rcvbuf; /* Packet reception buffer */ + size_t rcvbuf_len; /* Used size of rcvbuf */ + size_t rcvbuf_alloc; /* Allocated size of rcvbuf */ + clusterNode *node; /* Node related to this link. Initialized to NULL when unknown */ + int inbound; /* 1 if this link is an inbound link accepted from the related node */ +} clusterLink; + +/* Cluster node flags and macros. */ +#define CLUSTER_NODE_MASTER 1 /* The node is a master */ +#define CLUSTER_NODE_SLAVE 2 /* The node is a slave */ +#define CLUSTER_NODE_PFAIL 4 /* Failure? 
Need acknowledge */ +#define CLUSTER_NODE_FAIL 8 /* The node is believed to be malfunctioning */ +#define CLUSTER_NODE_MYSELF 16 /* This node is myself */ +#define CLUSTER_NODE_HANDSHAKE 32 /* We have still to exchange the first ping */ +#define CLUSTER_NODE_NOADDR 64 /* We don't know the address of this node */ +#define CLUSTER_NODE_MEET 128 /* Send a MEET message to this node */ +#define CLUSTER_NODE_MIGRATE_TO 256 /* Master eligible for replica migration. */ +#define CLUSTER_NODE_NOFAILOVER 512 /* Slave will not try to failover. */ +#define CLUSTER_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" + +#define nodeIsSlave(n) ((n)->flags & CLUSTER_NODE_SLAVE) +#define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE) +#define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR)) +#define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL) +#define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL) +#define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER) + +/* This structure represent elements of node->fail_reports. */ +typedef struct clusterNodeFailReport { + clusterNode *node; /* Node reporting the failure condition. */ + mstime_t time; /* Time of the last report from this node. */ +} clusterNodeFailReport; + +/* Redis cluster messages header */ + +/* Message types. + * + * Note that the PING, PONG and MEET messages are actually the same exact + * kind of packet. PONG is the reply to ping, in the exact format as a PING, + * while MEET is a special PING that forces the receiver to add the sender + * as a node (if it is not already in the list). 
*/ +#define CLUSTERMSG_TYPE_PING 0 /* Ping */ +#define CLUSTERMSG_TYPE_PONG 1 /* Pong (reply to Ping) */ +#define CLUSTERMSG_TYPE_MEET 2 /* Meet "let's join" message */ +#define CLUSTERMSG_TYPE_FAIL 3 /* Mark node xxx as failing */ +#define CLUSTERMSG_TYPE_PUBLISH 4 /* Pub/Sub Publish propagation */ +#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */ +#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you have my vote */ +#define CLUSTERMSG_TYPE_UPDATE 7 /* Another node slots configuration */ +#define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */ +#define CLUSTERMSG_TYPE_MODULE 9 /* Module cluster API message. */ +#define CLUSTERMSG_TYPE_PUBLISHSHARD 10 /* Pub/Sub Publish shard propagation */ +#define CLUSTERMSG_TYPE_COUNT 11 /* Total number of message types. */ + +/* Initially we don't know our "name", but we'll find it once we connect + * to the first node, using the getsockname() function. Then we'll use this + * address for all the next messages. */ +typedef struct { + char nodename[CLUSTER_NAMELEN]; + uint32_t ping_sent; + uint32_t pong_received; + char ip[NET_IP_STR_LEN]; /* IP address last time it was seen */ + uint16_t port; /* primary port last time it was seen */ + uint16_t cport; /* cluster port last time it was seen */ + uint16_t flags; /* node->flags copy */ + uint16_t pport; /* secondary port last time it was seen */ + uint16_t notused1; +} clusterMsgDataGossip; + +typedef struct { + char nodename[CLUSTER_NAMELEN]; +} clusterMsgDataFail; + +typedef struct { + uint32_t channel_len; + uint32_t message_len; + unsigned char bulk_data[8]; /* 8 bytes just as placeholder. */ +} clusterMsgDataPublish; + +typedef struct { + uint64_t configEpoch; /* Config epoch of the specified instance. */ + char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */ + unsigned char slots[CLUSTER_SLOTS/8]; /* Slots bitmap. */ +} clusterMsgDataUpdate; + +typedef struct { + uint64_t module_id; /* ID of the sender module. 
*/ + uint32_t len; /* ID of the sender module. */ + uint8_t type; /* Type from 0 to 255. */ + unsigned char bulk_data[3]; /* 3 bytes just as placeholder. */ +} clusterMsgModule; + +/* The cluster supports optional extension messages that can be sent + * along with ping/pong/meet messages to give additional info in a + * consistent manner. */ +typedef enum { + CLUSTERMSG_EXT_TYPE_HOSTNAME, + CLUSTERMSG_EXT_TYPE_HUMAN_NODENAME, + CLUSTERMSG_EXT_TYPE_FORGOTTEN_NODE, + CLUSTERMSG_EXT_TYPE_SHARDID, +} clusterMsgPingtypes; + +/* Helper function for making sure extensions are eight byte aligned. */ +#define EIGHT_BYTE_ALIGN(size) ((((size) + 7) / 8) * 8) + +typedef struct { + char hostname[1]; /* The announced hostname, ends with \0. */ +} clusterMsgPingExtHostname; + +typedef struct { + char human_nodename[1]; /* The announced nodename, ends with \0. */ +} clusterMsgPingExtHumanNodename; + +typedef struct { + char name[CLUSTER_NAMELEN]; /* Node name. */ + uint64_t ttl; /* Remaining time to blacklist the node, in seconds. */ +} clusterMsgPingExtForgottenNode; + +static_assert(sizeof(clusterMsgPingExtForgottenNode) % 8 == 0, ""); + +typedef struct { + char shard_id[CLUSTER_NAMELEN]; /* The shard_id, 40 bytes fixed. */ +} clusterMsgPingExtShardId; + +typedef struct { + uint32_t length; /* Total length of this extension message (including this header) */ + uint16_t type; /* Type of this extension message (see clusterMsgPingExtTypes) */ + uint16_t unused; /* 16 bits of padding to make this structure 8 byte aligned. */ + union { + clusterMsgPingExtHostname hostname; + clusterMsgPingExtHumanNodename human_nodename; + clusterMsgPingExtForgottenNode forgotten_node; + clusterMsgPingExtShardId shard_id; + } ext[]; /* Actual extension information, formatted so that the data is 8 + * byte aligned, regardless of its content. 
*/ +} clusterMsgPingExt; + +union clusterMsgData { + /* PING, MEET and PONG */ + struct { + /* Array of N clusterMsgDataGossip structures */ + clusterMsgDataGossip gossip[1]; + /* Extension data that can optionally be sent for ping/meet/pong + * messages. We can't explicitly define them here though, since + * the gossip array isn't the real length of the gossip data. */ + } ping; + + /* FAIL */ + struct { + clusterMsgDataFail about; + } fail; + + /* PUBLISH */ + struct { + clusterMsgDataPublish msg; + } publish; + + /* UPDATE */ + struct { + clusterMsgDataUpdate nodecfg; + } update; + + /* MODULE */ + struct { + clusterMsgModule msg; + } module; +}; + +#define CLUSTER_PROTO_VER 1 /* Cluster bus protocol version. */ + +typedef struct { + char sig[4]; /* Signature "RCmb" (Redis Cluster message bus). */ + uint32_t totlen; /* Total length of this message */ + uint16_t ver; /* Protocol version, currently set to 1. */ + uint16_t port; /* Primary port number (TCP or TLS). */ + uint16_t type; /* Message type */ + uint16_t count; /* Only used for some kind of messages. */ + uint64_t currentEpoch; /* The epoch accordingly to the sending node. */ + uint64_t configEpoch; /* The config epoch if it's a master, or the last + epoch advertised by its master if it is a + slave. */ + uint64_t offset; /* Master replication offset if node is a master or + processed replication offset if node is a slave. */ + char sender[CLUSTER_NAMELEN]; /* Name of the sender node */ + unsigned char myslots[CLUSTER_SLOTS/8]; + char slaveof[CLUSTER_NAMELEN]; + char myip[NET_IP_STR_LEN]; /* Sender IP, if not all zeroed. */ + uint16_t extensions; /* Number of extensions sent along with this packet. */ + char notused1[30]; /* 30 bytes reserved for future usage. 
*/ + uint16_t pport; /* Secondary port number: if primary port is TCP port, this is + TLS port, and if primary port is TLS port, this is TCP port.*/ + uint16_t cport; /* Sender TCP cluster bus port */ + uint16_t flags; /* Sender node flags */ + unsigned char state; /* Cluster state from the POV of the sender */ + unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... */ + union clusterMsgData data; +} clusterMsg; + +/* clusterMsg defines the gossip wire protocol exchanged among Redis cluster + * members, which can be running different versions of redis-server bits, + * especially during cluster rolling upgrades. + * + * Therefore, fields in this struct should remain at the same offset from + * release to release. The static asserts below ensures that incompatible + * changes in clusterMsg be caught at compile time. + */ + +static_assert(offsetof(clusterMsg, sig) == 0, "unexpected field offset"); +static_assert(offsetof(clusterMsg, totlen) == 4, "unexpected field offset"); +static_assert(offsetof(clusterMsg, ver) == 8, "unexpected field offset"); +static_assert(offsetof(clusterMsg, port) == 10, "unexpected field offset"); +static_assert(offsetof(clusterMsg, type) == 12, "unexpected field offset"); +static_assert(offsetof(clusterMsg, count) == 14, "unexpected field offset"); +static_assert(offsetof(clusterMsg, currentEpoch) == 16, "unexpected field offset"); +static_assert(offsetof(clusterMsg, configEpoch) == 24, "unexpected field offset"); +static_assert(offsetof(clusterMsg, offset) == 32, "unexpected field offset"); +static_assert(offsetof(clusterMsg, sender) == 40, "unexpected field offset"); +static_assert(offsetof(clusterMsg, myslots) == 80, "unexpected field offset"); +static_assert(offsetof(clusterMsg, slaveof) == 2128, "unexpected field offset"); +static_assert(offsetof(clusterMsg, myip) == 2168, "unexpected field offset"); +static_assert(offsetof(clusterMsg, extensions) == 2214, "unexpected field offset"); +static_assert(offsetof(clusterMsg, 
notused1) == 2216, "unexpected field offset"); +static_assert(offsetof(clusterMsg, pport) == 2246, "unexpected field offset"); +static_assert(offsetof(clusterMsg, cport) == 2248, "unexpected field offset"); +static_assert(offsetof(clusterMsg, flags) == 2250, "unexpected field offset"); +static_assert(offsetof(clusterMsg, state) == 2252, "unexpected field offset"); +static_assert(offsetof(clusterMsg, mflags) == 2253, "unexpected field offset"); +static_assert(offsetof(clusterMsg, data) == 2256, "unexpected field offset"); + +#define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData)) + +/* Message flags better specify the packet content or are used to + * provide some information about the node state. */ +#define CLUSTERMSG_FLAG0_PAUSED (1<<0) /* Master paused for manual failover. */ +#define CLUSTERMSG_FLAG0_FORCEACK (1<<1) /* Give ACK to AUTH_REQUEST even if + master is up. */ +#define CLUSTERMSG_FLAG0_EXT_DATA (1<<2) /* Message contains extension data */ + +struct _clusterNode { + mstime_t ctime; /* Node object creation time. */ + char name[CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */ + char shard_id[CLUSTER_NAMELEN]; /* shard id, hex string, sha1-size */ + int flags; /* CLUSTER_NODE_... */ + uint64_t configEpoch; /* Last configEpoch observed for this node */ + unsigned char slots[CLUSTER_SLOTS/8]; /* slots handled by this node */ + uint16_t *slot_info_pairs; /* Slots info represented as (start/end) pair (consecutive index). */ + int slot_info_pairs_count; /* Used number of slots in slot_info_pairs */ + int numslots; /* Number of slots handled by this node */ + int numslaves; /* Number of slave nodes, if this is a master */ + clusterNode **slaves; /* pointers to slave nodes */ + clusterNode *slaveof; /* pointer to the master node. Note that it + may be NULL even if the node is a slave + if we don't have the master node in our + tables. 
*/ + unsigned long long last_in_ping_gossip; /* The number of the last carried in the ping gossip section */ + mstime_t ping_sent; /* Unix time we sent latest ping */ + mstime_t pong_received; /* Unix time we received the pong */ + mstime_t data_received; /* Unix time we received any data */ + mstime_t fail_time; /* Unix time when FAIL flag was set */ + mstime_t voted_time; /* Last time we voted for a slave of this master */ + mstime_t repl_offset_time; /* Unix time we received offset for this node */ + mstime_t orphaned_time; /* Starting time of orphaned master condition */ + long long repl_offset; /* Last known repl offset for this node. */ + char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */ + sds hostname; /* The known hostname for this node */ + sds human_nodename; /* The known human readable nodename for this node */ + int tcp_port; /* Latest known clients TCP port. */ + int tls_port; /* Latest known clients TLS port */ + int cport; /* Latest known cluster port of this node. */ + clusterLink *link; /* TCP/IP link established toward this node */ + clusterLink *inbound_link; /* TCP/IP link accepted from this node */ + list *fail_reports; /* List of nodes signaling this as failing */ +}; + +struct clusterState { + clusterNode *myself; /* This node */ + uint64_t currentEpoch; + int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */ + int size; /* Num of master nodes with at least one slot */ + dict *nodes; /* Hash table of name -> clusterNode structures */ + dict *shards; /* Hash table of shard_id -> list (of nodes) structures */ + dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */ + clusterNode *migrating_slots_to[CLUSTER_SLOTS]; + clusterNode *importing_slots_from[CLUSTER_SLOTS]; + clusterNode *slots[CLUSTER_SLOTS]; + /* The following fields are used to take the slave state on elections. */ + mstime_t failover_auth_time; /* Time of previous or next election. */ + int failover_auth_count; /* Number of votes received so far. 
*/ + int failover_auth_sent; /* True if we already asked for votes. */ + int failover_auth_rank; /* This slave rank for current auth request. */ + uint64_t failover_auth_epoch; /* Epoch of the current election. */ + int cant_failover_reason; /* Why a slave is currently not able to + failover. See the CANT_FAILOVER_* macros. */ + /* Manual failover state in common. */ + mstime_t mf_end; /* Manual failover time limit (ms unixtime). + It is zero if there is no MF in progress. */ + /* Manual failover state of master. */ + clusterNode *mf_slave; /* Slave performing the manual failover. */ + /* Manual failover state of slave. */ + long long mf_master_offset; /* Master offset the slave needs to start MF + or -1 if still not received. */ + int mf_can_start; /* If non-zero signal that the manual failover + can start requesting masters vote. */ + /* The following fields are used by masters to take state on elections. */ + uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */ + int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ + /* Stats */ + /* Messages received and sent by type. */ + long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT]; + long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT]; + long long stats_pfail_nodes; /* Number of nodes in PFAIL status, + excluding nodes without address. */ + unsigned long long stat_cluster_links_buffer_limit_exceeded; /* Total number of cluster links freed due to exceeding buffer limit */ + + /* Bit map for slots that are no longer claimed by the owner in cluster PING + * messages. During slot migration, the owner will stop claiming the slot after + * the ownership transfer. Set the bit corresponding to the slot when a node + * stops claiming the slot. This prevents spreading incorrect information (that + * source still owns the slot) using UPDATE messages. 
*/ + unsigned char owner_not_claiming_slot[CLUSTER_SLOTS / 8]; +}; + + +#endif //CLUSTER_LEGACY_H diff --git a/src/commands.def b/src/commands.def index 7e575648c5e..ff8b81d4188 100644 --- a/src/commands.def +++ b/src/commands.def @@ -964,14 +964,14 @@ struct COMMAND_STRUCT CLUSTER_Subcommands[] = { {MAKE_CMD("myid","Returns the ID of a node.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MYID_History,0,CLUSTER_MYID_Tips,0,clusterCommand,2,CMD_STALE,0,CLUSTER_MYID_Keyspecs,0,NULL,0)}, {MAKE_CMD("myshardid","Returns the shard ID of a node.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MYSHARDID_History,0,CLUSTER_MYSHARDID_Tips,1,clusterCommand,2,CMD_STALE,0,CLUSTER_MYSHARDID_Keyspecs,0,NULL,0)}, {MAKE_CMD("nodes","Returns the cluster configuration for a node.","O(N) where N is the total number of Cluster nodes","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_NODES_History,0,CLUSTER_NODES_Tips,1,clusterCommand,2,CMD_STALE,0,CLUSTER_NODES_Keyspecs,0,NULL,0)}, -{MAKE_CMD("replicas","Lists the replica nodes of a master node.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_REPLICAS_History,0,CLUSTER_REPLICAS_Tips,1,clusterCommand,3,CMD_ADMIN|CMD_STALE,0,CLUSTER_REPLICAS_Keyspecs,0,NULL,1),.args=CLUSTER_REPLICAS_Args}, +{MAKE_CMD("replicas","Lists the replica nodes of a master node.","O(N) where N is the number of replicas.","5.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_REPLICAS_History,0,CLUSTER_REPLICAS_Tips,1,clusterCommand,3,CMD_ADMIN|CMD_STALE,0,CLUSTER_REPLICAS_Keyspecs,0,NULL,1),.args=CLUSTER_REPLICAS_Args}, {MAKE_CMD("replicate","Configure a node as replica of a master node.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_REPLICATE_History,0,CLUSTER_REPLICATE_Tips,0,clusterCommand,3,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_REPLICATE_Keyspecs,0,NULL,1),.args=CLUSTER_REPLICATE_Args}, 
{MAKE_CMD("reset","Resets a node.","O(N) where N is the number of known nodes. The command may execute a FLUSHALL as a side effect.","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_RESET_History,0,CLUSTER_RESET_Tips,0,clusterCommand,-2,CMD_ADMIN|CMD_STALE|CMD_NOSCRIPT,0,CLUSTER_RESET_Keyspecs,0,NULL,1),.args=CLUSTER_RESET_Args}, {MAKE_CMD("saveconfig","Forces a node to save the cluster configuration to disk.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_SAVECONFIG_History,0,CLUSTER_SAVECONFIG_Tips,0,clusterCommand,2,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_SAVECONFIG_Keyspecs,0,NULL,0)}, {MAKE_CMD("set-config-epoch","Sets the configuration epoch for a new node.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_SET_CONFIG_EPOCH_History,0,CLUSTER_SET_CONFIG_EPOCH_Tips,0,clusterCommand,3,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_SET_CONFIG_EPOCH_Keyspecs,0,NULL,1),.args=CLUSTER_SET_CONFIG_EPOCH_Args}, {MAKE_CMD("setslot","Binds a hash slot to a node.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_SETSLOT_History,0,CLUSTER_SETSLOT_Tips,0,clusterCommand,-4,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_SETSLOT_Keyspecs,0,NULL,2),.args=CLUSTER_SETSLOT_Args}, {MAKE_CMD("shards","Returns the mapping of cluster slots to shards.","O(N) where N is the total number of cluster nodes","7.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_SHARDS_History,0,CLUSTER_SHARDS_Tips,1,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_SHARDS_Keyspecs,0,NULL,0)}, -{MAKE_CMD("slaves","Lists the replica nodes of a master node.","O(1)","3.0.0",CMD_DOC_DEPRECATED,"`CLUSTER REPLICAS`","5.0.0","cluster",COMMAND_GROUP_CLUSTER,CLUSTER_SLAVES_History,0,CLUSTER_SLAVES_Tips,1,clusterCommand,3,CMD_ADMIN|CMD_STALE,0,CLUSTER_SLAVES_Keyspecs,0,NULL,1),.args=CLUSTER_SLAVES_Args}, +{MAKE_CMD("slaves","Lists the replica nodes of a master node.","O(N) 
where N is the number of replicas.","3.0.0",CMD_DOC_DEPRECATED,"`CLUSTER REPLICAS`","5.0.0","cluster",COMMAND_GROUP_CLUSTER,CLUSTER_SLAVES_History,0,CLUSTER_SLAVES_Tips,1,clusterCommand,3,CMD_ADMIN|CMD_STALE,0,CLUSTER_SLAVES_Keyspecs,0,NULL,1),.args=CLUSTER_SLAVES_Args}, {MAKE_CMD("slots","Returns the mapping of cluster slots to nodes.","O(N) where N is the total number of Cluster nodes","3.0.0",CMD_DOC_DEPRECATED,"`CLUSTER SHARDS`","7.0.0","cluster",COMMAND_GROUP_CLUSTER,CLUSTER_SLOTS_History,2,CLUSTER_SLOTS_Tips,1,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_SLOTS_Keyspecs,0,NULL,0)}, {0} }; @@ -1177,6 +1177,7 @@ commandHistory CLIENT_KILL_History[] = { {"3.2.0","Added `master` type in for `TYPE` option."}, {"5.0.0","Replaced `slave` `TYPE` with `replica`. `slave` still supported for backward compatibility."}, {"6.2.0","`LADDR` option."}, +{"7.4.0","`MAXAGE` option."}, }; #endif @@ -1213,12 +1214,13 @@ struct COMMAND_ARG CLIENT_KILL_filter_new_format_Subargs[] = { {MAKE_ARG("addr",ARG_TYPE_STRING,-1,"ADDR",NULL,NULL,CMD_ARG_OPTIONAL,0,NULL),.display_text="ip:port"}, {MAKE_ARG("laddr",ARG_TYPE_STRING,-1,"LADDR",NULL,"6.2.0",CMD_ARG_OPTIONAL,0,NULL),.display_text="ip:port"}, {MAKE_ARG("skipme",ARG_TYPE_ONEOF,-1,"SKIPME",NULL,NULL,CMD_ARG_OPTIONAL,2,NULL),.subargs=CLIENT_KILL_filter_new_format_skipme_Subargs}, +{MAKE_ARG("maxage",ARG_TYPE_INTEGER,-1,"MAXAGE",NULL,"7.4.0",CMD_ARG_OPTIONAL,0,NULL)}, }; /* CLIENT KILL filter argument table */ struct COMMAND_ARG CLIENT_KILL_filter_Subargs[] = { {MAKE_ARG("old-format",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,"2.8.12"),.display_text="ip:port"}, -{MAKE_ARG("new-format",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,6,NULL),.subargs=CLIENT_KILL_filter_new_format_Subargs}, +{MAKE_ARG("new-format",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,7,NULL),.subargs=CLIENT_KILL_filter_new_format_Subargs}, }; /* CLIENT KILL argument table */ @@ -1391,7 +1393,10 @@ struct COMMAND_ARG CLIENT_REPLY_Args[] = { #ifndef 
SKIP_CMD_TIPS_TABLE /* CLIENT SETINFO tips */ -#define CLIENT_SETINFO_Tips NULL +const char *CLIENT_SETINFO_Tips[] = { +"request_policy:all_nodes", +"response_policy:all_succeeded", +}; #endif #ifndef SKIP_CMD_KEY_SPECS_TABLE @@ -1419,7 +1424,10 @@ struct COMMAND_ARG CLIENT_SETINFO_Args[] = { #ifndef SKIP_CMD_TIPS_TABLE /* CLIENT SETNAME tips */ -#define CLIENT_SETNAME_Tips NULL +const char *CLIENT_SETNAME_Tips[] = { +"request_policy:all_nodes", +"response_policy:all_succeeded", +}; #endif #ifndef SKIP_CMD_KEY_SPECS_TABLE @@ -1537,14 +1545,14 @@ struct COMMAND_STRUCT CLIENT_Subcommands[] = { {MAKE_CMD("help","Returns helpful text about the different subcommands.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_HELP_History,0,CLIENT_HELP_Tips,0,clientCommand,2,CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_HELP_Keyspecs,0,NULL,0)}, {MAKE_CMD("id","Returns the unique client ID of the connection.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_ID_History,0,CLIENT_ID_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_ID_Keyspecs,0,NULL,0)}, {MAKE_CMD("info","Returns information about the connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_INFO_History,0,CLIENT_INFO_Tips,1,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_INFO_Keyspecs,0,NULL,0)}, -{MAKE_CMD("kill","Terminates open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_KILL_History,5,CLIENT_KILL_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_KILL_Keyspecs,0,NULL,1),.args=CLIENT_KILL_Args}, +{MAKE_CMD("kill","Terminates open connections.","O(N) where N is the number of client 
connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_KILL_History,6,CLIENT_KILL_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_KILL_Keyspecs,0,NULL,1),.args=CLIENT_KILL_Args}, {MAKE_CMD("list","Lists open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_LIST_History,6,CLIENT_LIST_Tips,1,clientCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_LIST_Keyspecs,0,NULL,2),.args=CLIENT_LIST_Args}, {MAKE_CMD("no-evict","Sets the client eviction mode of the connection.","O(1)","7.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_NO_EVICT_History,0,CLIENT_NO_EVICT_Tips,0,clientCommand,3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_NO_EVICT_Keyspecs,0,NULL,1),.args=CLIENT_NO_EVICT_Args}, {MAKE_CMD("no-touch","Controls whether commands sent by the client affect the LRU/LFU of accessed keys.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_NO_TOUCH_History,0,CLIENT_NO_TOUCH_Tips,0,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_NO_TOUCH_Keyspecs,0,NULL,1),.args=CLIENT_NO_TOUCH_Args}, {MAKE_CMD("pause","Suspends commands processing.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_PAUSE_History,1,CLIENT_PAUSE_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_PAUSE_Keyspecs,0,NULL,2),.args=CLIENT_PAUSE_Args}, {MAKE_CMD("reply","Instructs the server whether to reply to 
commands.","O(1)","3.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_REPLY_History,0,CLIENT_REPLY_Tips,0,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_REPLY_Keyspecs,0,NULL,1),.args=CLIENT_REPLY_Args}, -{MAKE_CMD("setinfo","Sets information specific to the client or connection.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_SETINFO_History,0,CLIENT_SETINFO_Tips,0,clientSetinfoCommand,4,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_SETINFO_Keyspecs,0,NULL,1),.args=CLIENT_SETINFO_Args}, -{MAKE_CMD("setname","Sets the connection name.","O(1)","2.6.9",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_SETNAME_History,0,CLIENT_SETNAME_Tips,0,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_SETNAME_Keyspecs,0,NULL,1),.args=CLIENT_SETNAME_Args}, +{MAKE_CMD("setinfo","Sets information specific to the client or connection.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_SETINFO_History,0,CLIENT_SETINFO_Tips,2,clientSetinfoCommand,4,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_SETINFO_Keyspecs,0,NULL,1),.args=CLIENT_SETINFO_Args}, +{MAKE_CMD("setname","Sets the connection name.","O(1)","2.6.9",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_SETNAME_History,0,CLIENT_SETNAME_Tips,2,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_SETNAME_Keyspecs,0,NULL,1),.args=CLIENT_SETNAME_Args}, {MAKE_CMD("tracking","Controls server-assisted client-side caching for the connection.","O(1). 
Some options may introduce additional complexity.","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_TRACKING_History,0,CLIENT_TRACKING_Tips,0,clientCommand,-3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_TRACKING_Keyspecs,0,NULL,7),.args=CLIENT_TRACKING_Args}, {MAKE_CMD("trackinginfo","Returns information about server-assisted client-side caching for the connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_TRACKINGINFO_History,0,CLIENT_TRACKINGINFO_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_TRACKINGINFO_Keyspecs,0,NULL,0)}, {MAKE_CMD("unblock","Unblocks a client blocked by a blocking command from a different connection.","O(log N) where N is the number of client connections","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_UNBLOCK_History,0,CLIENT_UNBLOCK_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_UNBLOCK_Keyspecs,0,NULL,2),.args=CLIENT_UNBLOCK_Args}, @@ -2328,6 +2336,7 @@ struct COMMAND_ARG PTTL_Args[] = { /* RANDOMKEY tips */ const char *RANDOMKEY_Tips[] = { "request_policy:all_shards", +"response_policy:special", "nondeterministic_output", }; #endif @@ -2437,6 +2446,7 @@ commandHistory SCAN_History[] = { const char *SCAN_Tips[] = { "nondeterministic_output", "request_policy:special", +"response_policy:special", }; #endif @@ -2890,6 +2900,7 @@ struct COMMAND_ARG GEORADIUS_Args[] = { #ifndef SKIP_CMD_HISTORY_TABLE /* GEORADIUSBYMEMBER history */ commandHistory GEORADIUSBYMEMBER_History[] = { +{"6.2.0","Added the `ANY` option for `COUNT`."}, {"7.0.0","Added support for uppercase unit names."}, }; #endif @@ -2950,7 +2961,10 @@ struct COMMAND_ARG GEORADIUSBYMEMBER_Args[] = { #ifndef SKIP_CMD_HISTORY_TABLE /* GEORADIUSBYMEMBER_RO history */ -#define GEORADIUSBYMEMBER_RO_History NULL +commandHistory 
GEORADIUSBYMEMBER_RO_History[] = { +{"6.2.0","Added the `ANY` option for `COUNT`."}, +{"7.0.0","Added support for uppercase unit names."}, +}; #endif #ifndef SKIP_CMD_TIPS_TABLE @@ -3004,6 +3018,7 @@ struct COMMAND_ARG GEORADIUSBYMEMBER_RO_Args[] = { /* GEORADIUS_RO history */ commandHistory GEORADIUS_RO_History[] = { {"6.2.0","Added the `ANY` option for `COUNT`."}, +{"7.0.0","Added support for uppercase unit names."}, }; #endif @@ -3288,6 +3303,119 @@ struct COMMAND_ARG HEXISTS_Args[] = { {MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, }; +/********** HEXPIRE ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HEXPIRE history */ +#define HEXPIRE_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HEXPIRE tips */ +#define HEXPIRE_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HEXPIRE key specs */ +keySpec HEXPIRE_Keyspecs[1] = { +{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HEXPIRE condition argument table */ +struct COMMAND_ARG HEXPIRE_condition_Subargs[] = { +{MAKE_ARG("nx",ARG_TYPE_PURE_TOKEN,-1,"NX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("xx",ARG_TYPE_PURE_TOKEN,-1,"XX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("gt",ARG_TYPE_PURE_TOKEN,-1,"GT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("lt",ARG_TYPE_PURE_TOKEN,-1,"LT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* HEXPIRE fields argument table */ +struct COMMAND_ARG HEXPIRE_fields_Subargs[] = { +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/* HEXPIRE argument table */ +struct COMMAND_ARG HEXPIRE_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("seconds",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("condition",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,4,NULL),.subargs=HEXPIRE_condition_Subargs}, 
+{MAKE_ARG("fields",ARG_TYPE_BLOCK,-1,"FIELDS",NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=HEXPIRE_fields_Subargs}, +}; + +/********** HEXPIREAT ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HEXPIREAT history */ +#define HEXPIREAT_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HEXPIREAT tips */ +#define HEXPIREAT_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HEXPIREAT key specs */ +keySpec HEXPIREAT_Keyspecs[1] = { +{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HEXPIREAT condition argument table */ +struct COMMAND_ARG HEXPIREAT_condition_Subargs[] = { +{MAKE_ARG("nx",ARG_TYPE_PURE_TOKEN,-1,"NX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("xx",ARG_TYPE_PURE_TOKEN,-1,"XX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("gt",ARG_TYPE_PURE_TOKEN,-1,"GT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("lt",ARG_TYPE_PURE_TOKEN,-1,"LT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* HEXPIREAT fields argument table */ +struct COMMAND_ARG HEXPIREAT_fields_Subargs[] = { +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/* HEXPIREAT argument table */ +struct COMMAND_ARG HEXPIREAT_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("unix-time-seconds",ARG_TYPE_UNIX_TIME,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("condition",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,4,NULL),.subargs=HEXPIREAT_condition_Subargs}, +{MAKE_ARG("fields",ARG_TYPE_BLOCK,-1,"FIELDS",NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=HEXPIREAT_fields_Subargs}, +}; + +/********** HEXPIRETIME ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HEXPIRETIME history */ +#define HEXPIRETIME_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HEXPIRETIME tips */ +#define HEXPIRETIME_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HEXPIRETIME 
key specs */ +keySpec HEXPIRETIME_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HEXPIRETIME fields argument table */ +struct COMMAND_ARG HEXPIRETIME_fields_Subargs[] = { +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/* HEXPIRETIME argument table */ +struct COMMAND_ARG HEXPIRETIME_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("fields",ARG_TYPE_BLOCK,-1,"FIELDS",NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=HEXPIRETIME_fields_Subargs}, +}; + /********** HGET ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -3497,6 +3625,181 @@ struct COMMAND_ARG HMSET_Args[] = { {MAKE_ARG("data",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,2,NULL),.subargs=HMSET_data_Subargs}, }; +/********** HPERSIST ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HPERSIST history */ +#define HPERSIST_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HPERSIST tips */ +#define HPERSIST_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HPERSIST key specs */ +keySpec HPERSIST_Keyspecs[1] = { +{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HPERSIST fields argument table */ +struct COMMAND_ARG HPERSIST_fields_Subargs[] = { +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/* HPERSIST argument table */ +struct COMMAND_ARG HPERSIST_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("fields",ARG_TYPE_BLOCK,-1,"FIELDS",NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=HPERSIST_fields_Subargs}, +}; + +/********** HPEXPIRE ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HPEXPIRE history */ +#define 
HPEXPIRE_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HPEXPIRE tips */ +#define HPEXPIRE_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HPEXPIRE key specs */ +keySpec HPEXPIRE_Keyspecs[1] = { +{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HPEXPIRE condition argument table */ +struct COMMAND_ARG HPEXPIRE_condition_Subargs[] = { +{MAKE_ARG("nx",ARG_TYPE_PURE_TOKEN,-1,"NX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("xx",ARG_TYPE_PURE_TOKEN,-1,"XX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("gt",ARG_TYPE_PURE_TOKEN,-1,"GT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("lt",ARG_TYPE_PURE_TOKEN,-1,"LT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* HPEXPIRE fields argument table */ +struct COMMAND_ARG HPEXPIRE_fields_Subargs[] = { +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/* HPEXPIRE argument table */ +struct COMMAND_ARG HPEXPIRE_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("milliseconds",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("condition",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,4,NULL),.subargs=HPEXPIRE_condition_Subargs}, +{MAKE_ARG("fields",ARG_TYPE_BLOCK,-1,"FIELDS",NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=HPEXPIRE_fields_Subargs}, +}; + +/********** HPEXPIREAT ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HPEXPIREAT history */ +#define HPEXPIREAT_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HPEXPIREAT tips */ +#define HPEXPIREAT_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HPEXPIREAT key specs */ +keySpec HPEXPIREAT_Keyspecs[1] = { +{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HPEXPIREAT condition argument table */ +struct COMMAND_ARG HPEXPIREAT_condition_Subargs[] = { 
+{MAKE_ARG("nx",ARG_TYPE_PURE_TOKEN,-1,"NX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("xx",ARG_TYPE_PURE_TOKEN,-1,"XX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("gt",ARG_TYPE_PURE_TOKEN,-1,"GT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("lt",ARG_TYPE_PURE_TOKEN,-1,"LT",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* HPEXPIREAT fields argument table */ +struct COMMAND_ARG HPEXPIREAT_fields_Subargs[] = { +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/* HPEXPIREAT argument table */ +struct COMMAND_ARG HPEXPIREAT_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("unix-time-milliseconds",ARG_TYPE_UNIX_TIME,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("condition",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,4,NULL),.subargs=HPEXPIREAT_condition_Subargs}, +{MAKE_ARG("fields",ARG_TYPE_BLOCK,-1,"FIELDS",NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=HPEXPIREAT_fields_Subargs}, +}; + +/********** HPEXPIRETIME ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HPEXPIRETIME history */ +#define HPEXPIRETIME_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HPEXPIRETIME tips */ +#define HPEXPIRETIME_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HPEXPIRETIME key specs */ +keySpec HPEXPIRETIME_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HPEXPIRETIME fields argument table */ +struct COMMAND_ARG HPEXPIRETIME_fields_Subargs[] = { +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/* HPEXPIRETIME argument table */ +struct COMMAND_ARG HPEXPIRETIME_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, 
+{MAKE_ARG("fields",ARG_TYPE_BLOCK,-1,"FIELDS",NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=HPEXPIRETIME_fields_Subargs}, +}; + +/********** HPTTL ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HPTTL history */ +#define HPTTL_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HPTTL tips */ +#define HPTTL_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HPTTL key specs */ +keySpec HPTTL_Keyspecs[1] = { +{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HPTTL fields argument table */ +struct COMMAND_ARG HPTTL_fields_Subargs[] = { +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/* HPTTL argument table */ +struct COMMAND_ARG HPTTL_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("fields",ARG_TYPE_BLOCK,-1,"FIELDS",NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=HPTTL_fields_Subargs}, +}; + /********** HRANDFIELD ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -3557,6 +3860,7 @@ struct COMMAND_ARG HSCAN_Args[] = { {MAKE_ARG("cursor",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, {MAKE_ARG("pattern",ARG_TYPE_PATTERN,-1,"MATCH",NULL,NULL,CMD_ARG_OPTIONAL,0,NULL)}, {MAKE_ARG("count",ARG_TYPE_INTEGER,-1,"COUNT",NULL,NULL,CMD_ARG_OPTIONAL,0,NULL)}, +{MAKE_ARG("novalues",ARG_TYPE_PURE_TOKEN,-1,"NOVALUES",NULL,NULL,CMD_ARG_OPTIONAL,0,NULL)}, }; /********** HSET ********************/ @@ -3643,6 +3947,37 @@ struct COMMAND_ARG HSTRLEN_Args[] = { {MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, }; +/********** HTTL ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* HTTL history */ +#define HTTL_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* HTTL tips */ +#define HTTL_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* HTTL key specs */ +keySpec HTTL_Keyspecs[1] = { 
+{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}} +}; +#endif + +/* HTTL fields argument table */ +struct COMMAND_ARG HTTL_fields_Subargs[] = { +{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)}, +}; + +/* HTTL argument table */ +struct COMMAND_ARG HTTL_Args[] = { +{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("fields",ARG_TYPE_BLOCK,-1,"FIELDS",NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=HTTL_fields_Subargs}, +}; + /********** HVALS ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -5884,7 +6219,10 @@ struct COMMAND_ARG ACL_CAT_Args[] = { #ifndef SKIP_CMD_TIPS_TABLE /* ACL DELUSER tips */ -#define ACL_DELUSER_Tips NULL +const char *ACL_DELUSER_Tips[] = { +"request_policy:all_nodes", +"response_policy:all_succeeded", +}; #endif #ifndef SKIP_CMD_KEY_SPECS_TABLE @@ -6058,7 +6396,10 @@ struct COMMAND_ARG ACL_LOG_Args[] = { #ifndef SKIP_CMD_TIPS_TABLE /* ACL SAVE tips */ -#define ACL_SAVE_Tips NULL +const char *ACL_SAVE_Tips[] = { +"request_policy:all_nodes", +"response_policy:all_succeeded", +}; #endif #ifndef SKIP_CMD_KEY_SPECS_TABLE @@ -6078,7 +6419,10 @@ commandHistory ACL_SETUSER_History[] = { #ifndef SKIP_CMD_TIPS_TABLE /* ACL SETUSER tips */ -#define ACL_SETUSER_Tips NULL +const char *ACL_SETUSER_Tips[] = { +"request_policy:all_nodes", +"response_policy:all_succeeded", +}; #endif #ifndef SKIP_CMD_KEY_SPECS_TABLE @@ -6129,7 +6473,7 @@ struct COMMAND_ARG ACL_SETUSER_Args[] = { /* ACL command table */ struct COMMAND_STRUCT ACL_Subcommands[] = { {MAKE_CMD("cat","Lists the ACL categories, or the commands inside a category.","O(1) since the categories and commands are a fixed 
set.","6.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,ACL_CAT_History,0,ACL_CAT_Tips,0,aclCommand,-2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,ACL_CAT_Keyspecs,0,NULL,1),.args=ACL_CAT_Args}, -{MAKE_CMD("deluser","Deletes ACL users, and terminates their connections.","O(1) amortized time considering the typical user.","6.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,ACL_DELUSER_History,0,ACL_DELUSER_Tips,0,aclCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,ACL_DELUSER_Keyspecs,0,NULL,1),.args=ACL_DELUSER_Args}, +{MAKE_CMD("deluser","Deletes ACL users, and terminates their connections.","O(1) amortized time considering the typical user.","6.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,ACL_DELUSER_History,0,ACL_DELUSER_Tips,2,aclCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,ACL_DELUSER_Keyspecs,0,NULL,1),.args=ACL_DELUSER_Args}, {MAKE_CMD("dryrun","Simulates the execution of a command by a user, without executing the command.","O(1).","7.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,ACL_DRYRUN_History,0,ACL_DRYRUN_Tips,0,aclCommand,-4,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,ACL_DRYRUN_Keyspecs,0,NULL,3),.args=ACL_DRYRUN_Args}, {MAKE_CMD("genpass","Generates a pseudorandom, secure password that can be used to identify ACL users.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,ACL_GENPASS_History,0,ACL_GENPASS_Tips,0,aclCommand,-2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,ACL_GENPASS_Keyspecs,0,NULL,1),.args=ACL_GENPASS_Args}, {MAKE_CMD("getuser","Lists the ACL rules of a user.","O(N). 
Where N is the number of password, command and pattern rules that the user has.","6.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,ACL_GETUSER_History,2,ACL_GETUSER_Tips,0,aclCommand,3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,ACL_GETUSER_Keyspecs,0,NULL,1),.args=ACL_GETUSER_Args}, @@ -6137,8 +6481,8 @@ struct COMMAND_STRUCT ACL_Subcommands[] = { {MAKE_CMD("list","Dumps the effective rules in ACL file format.","O(N). Where N is the number of configured users.","6.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,ACL_LIST_History,0,ACL_LIST_Tips,0,aclCommand,2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,ACL_LIST_Keyspecs,0,NULL,0)}, {MAKE_CMD("load","Reloads the rules from the configured ACL file.","O(N). Where N is the number of configured users.","6.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,ACL_LOAD_History,0,ACL_LOAD_Tips,0,aclCommand,2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,ACL_LOAD_Keyspecs,0,NULL,0)}, {MAKE_CMD("log","Lists recent security events generated due to ACL rules.","O(N) with N being the number of entries shown.","6.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,ACL_LOG_History,1,ACL_LOG_Tips,0,aclCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,ACL_LOG_Keyspecs,0,NULL,1),.args=ACL_LOG_Args}, -{MAKE_CMD("save","Saves the effective ACL rules in the configured ACL file.","O(N). Where N is the number of configured users.","6.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,ACL_SAVE_History,0,ACL_SAVE_Tips,0,aclCommand,2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,ACL_SAVE_Keyspecs,0,NULL,0)}, -{MAKE_CMD("setuser","Creates and modifies an ACL user and its rules.","O(N). 
Where N is the number of rules provided.","6.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,ACL_SETUSER_History,2,ACL_SETUSER_Tips,0,aclCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,ACL_SETUSER_Keyspecs,0,NULL,2),.args=ACL_SETUSER_Args}, +{MAKE_CMD("save","Saves the effective ACL rules in the configured ACL file.","O(N). Where N is the number of configured users.","6.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,ACL_SAVE_History,0,ACL_SAVE_Tips,2,aclCommand,2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,ACL_SAVE_Keyspecs,0,NULL,0)}, +{MAKE_CMD("setuser","Creates and modifies an ACL user and its rules.","O(N). Where N is the number of rules provided.","6.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,ACL_SETUSER_History,2,ACL_SETUSER_Tips,2,aclCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,ACL_SETUSER_Keyspecs,0,NULL,2),.args=ACL_SETUSER_Args}, {MAKE_CMD("users","Lists all ACL users.","O(N). 
Where N is the number of configured users.","6.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,ACL_USERS_History,0,ACL_USERS_Tips,0,aclCommand,2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,ACL_USERS_Keyspecs,0,NULL,0)}, {MAKE_CMD("whoami","Returns the authenticated username of the current connection.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,ACL_WHOAMI_History,0,ACL_WHOAMI_Tips,0,aclCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,0,ACL_WHOAMI_Keyspecs,0,NULL,0)}, {0} @@ -6444,7 +6788,10 @@ struct COMMAND_ARG CONFIG_GET_Args[] = { #ifndef SKIP_CMD_TIPS_TABLE /* CONFIG RESETSTAT tips */ -#define CONFIG_RESETSTAT_Tips NULL +const char *CONFIG_RESETSTAT_Tips[] = { +"request_policy:all_nodes", +"response_policy:all_succeeded", +}; #endif #ifndef SKIP_CMD_KEY_SPECS_TABLE @@ -6461,7 +6808,10 @@ struct COMMAND_ARG CONFIG_GET_Args[] = { #ifndef SKIP_CMD_TIPS_TABLE /* CONFIG REWRITE tips */ -#define CONFIG_REWRITE_Tips NULL +const char *CONFIG_REWRITE_Tips[] = { +"request_policy:all_nodes", +"response_policy:all_succeeded", +}; #endif #ifndef SKIP_CMD_KEY_SPECS_TABLE @@ -6506,8 +6856,8 @@ struct COMMAND_ARG CONFIG_SET_Args[] = { struct COMMAND_STRUCT CONFIG_Subcommands[] = { {MAKE_CMD("get","Returns the effective values of configuration parameters.","O(N) when N is the number of configuration parameters provided","2.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,CONFIG_GET_History,1,CONFIG_GET_Tips,0,configGetCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,0,CONFIG_GET_Keyspecs,0,NULL,1),.args=CONFIG_GET_Args}, {MAKE_CMD("help","Returns helpful text about the different subcommands.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,CONFIG_HELP_History,0,CONFIG_HELP_Tips,0,configHelpCommand,2,CMD_LOADING|CMD_STALE,0,CONFIG_HELP_Keyspecs,0,NULL,0)}, -{MAKE_CMD("resetstat","Resets the server's 
statistics.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,CONFIG_RESETSTAT_History,0,CONFIG_RESETSTAT_Tips,0,configResetStatCommand,2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,0,CONFIG_RESETSTAT_Keyspecs,0,NULL,0)}, -{MAKE_CMD("rewrite","Persists the effective configuration to file.","O(1)","2.8.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,CONFIG_REWRITE_History,0,CONFIG_REWRITE_Tips,0,configRewriteCommand,2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,0,CONFIG_REWRITE_Keyspecs,0,NULL,0)}, +{MAKE_CMD("resetstat","Resets the server's statistics.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,CONFIG_RESETSTAT_History,0,CONFIG_RESETSTAT_Tips,2,configResetStatCommand,2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,0,CONFIG_RESETSTAT_Keyspecs,0,NULL,0)}, +{MAKE_CMD("rewrite","Persists the effective configuration to file.","O(1)","2.8.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,CONFIG_REWRITE_History,0,CONFIG_REWRITE_Tips,2,configRewriteCommand,2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,0,CONFIG_REWRITE_Keyspecs,0,NULL,0)}, {MAKE_CMD("set","Sets configuration parameters in-flight.","O(N) when N is the number of configuration parameters provided","2.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,CONFIG_SET_History,1,CONFIG_SET_Tips,2,configSetCommand,-4,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,0,CONFIG_SET_Keyspecs,0,NULL,1),.args=CONFIG_SET_Args}, {0} }; @@ -6860,7 +7210,7 @@ const char *LATENCY_LATEST_Tips[] = { /* LATENCY RESET tips */ const char *LATENCY_RESET_Tips[] = { "request_policy:all_nodes", -"response_policy:all_succeeded", +"response_policy:agg_sum", }; #endif @@ -7290,12 +7640,29 @@ struct COMMAND_ARG PSYNC_Args[] = { #define REPLICAOF_Keyspecs NULL #endif -/* REPLICAOF argument table */ -struct COMMAND_ARG REPLICAOF_Args[] = { +/* REPLICAOF args host_port argument table */ +struct COMMAND_ARG REPLICAOF_args_host_port_Subargs[] = { 
{MAKE_ARG("host",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, {MAKE_ARG("port",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, }; +/* REPLICAOF args no_one argument table */ +struct COMMAND_ARG REPLICAOF_args_no_one_Subargs[] = { +{MAKE_ARG("no",ARG_TYPE_PURE_TOKEN,-1,"NO",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("one",ARG_TYPE_PURE_TOKEN,-1,"ONE",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* REPLICAOF args argument table */ +struct COMMAND_ARG REPLICAOF_args_Subargs[] = { +{MAKE_ARG("host-port",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=REPLICAOF_args_host_port_Subargs}, +{MAKE_ARG("no-one",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=REPLICAOF_args_no_one_Subargs}, +}; + +/* REPLICAOF argument table */ +struct COMMAND_ARG REPLICAOF_Args[] = { +{MAKE_ARG("args",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=REPLICAOF_args_Subargs}, +}; + /********** RESTORE_ASKING ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -7414,12 +7781,29 @@ struct COMMAND_ARG SHUTDOWN_Args[] = { #define SLAVEOF_Keyspecs NULL #endif -/* SLAVEOF argument table */ -struct COMMAND_ARG SLAVEOF_Args[] = { +/* SLAVEOF args host_port argument table */ +struct COMMAND_ARG SLAVEOF_args_host_port_Subargs[] = { {MAKE_ARG("host",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, {MAKE_ARG("port",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, }; +/* SLAVEOF args no_one argument table */ +struct COMMAND_ARG SLAVEOF_args_no_one_Subargs[] = { +{MAKE_ARG("no",ARG_TYPE_PURE_TOKEN,-1,"NO",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("one",ARG_TYPE_PURE_TOKEN,-1,"ONE",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* SLAVEOF args argument table */ +struct COMMAND_ARG SLAVEOF_args_Subargs[] = { +{MAKE_ARG("host-port",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=SLAVEOF_args_host_port_Subargs}, 
+{MAKE_ARG("no-one",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=SLAVEOF_args_no_one_Subargs}, +}; + +/* SLAVEOF argument table */ +struct COMMAND_ARG SLAVEOF_Args[] = { +{MAKE_ARG("args",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=SLAVEOF_args_Subargs}, +}; + /********** SLOWLOG GET ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -7759,7 +8143,7 @@ struct COMMAND_ARG SINTERCARD_Args[] = { #ifndef SKIP_CMD_KEY_SPECS_TABLE /* SINTERSTORE key specs */ keySpec SINTERSTORE_Keyspecs[2] = { -{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}},{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={2},KSPEC_FK_RANGE,.fk.range={-1,1,0}} +{NULL,CMD_KEY_OW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}},{NULL,CMD_KEY_RO|CMD_KEY_ACCESS,KSPEC_BS_INDEX,.bs.index={2},KSPEC_FK_RANGE,.fk.range={-1,1,0}} }; #endif @@ -9321,7 +9705,7 @@ struct COMMAND_ARG XGROUP_CREATE_Args[] = { {MAKE_ARG("group",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, {MAKE_ARG("id-selector",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=XGROUP_CREATE_id_selector_Subargs}, {MAKE_ARG("mkstream",ARG_TYPE_PURE_TOKEN,-1,"MKSTREAM",NULL,NULL,CMD_ARG_OPTIONAL,0,NULL)}, -{MAKE_ARG("entries-read",ARG_TYPE_INTEGER,-1,"ENTRIESREAD",NULL,NULL,CMD_ARG_OPTIONAL,0,NULL)}, +{MAKE_ARG("entriesread",ARG_TYPE_INTEGER,-1,"ENTRIESREAD",NULL,NULL,CMD_ARG_OPTIONAL,0,NULL),.display_text="entries-read"}, }; /********** XGROUP CREATECONSUMER ********************/ @@ -9486,7 +9870,7 @@ struct COMMAND_STRUCT XGROUP_Subcommands[] = { #ifndef SKIP_CMD_HISTORY_TABLE /* XINFO CONSUMERS history */ commandHistory XINFO_CONSUMERS_History[] = { -{"7.2.0","Added the `inactive` field."}, +{"7.2.0","Added the `inactive` field, and changed the meaning of `idle`."}, }; #endif @@ -10263,10 +10647,7 @@ struct COMMAND_ARG MSET_Args[] = { #ifndef SKIP_CMD_TIPS_TABLE /* MSETNX tips */ -const char 
*MSETNX_Tips[] = { -"request_policy:multi_shard", -"response_policy:agg_min", -}; +#define MSETNX_Tips NULL #endif #ifndef SKIP_CMD_KEY_SPECS_TABLE @@ -10621,33 +11002,36 @@ struct COMMAND_STRUCT redisCommandTable[] = { {MAKE_CMD("pexpireat","Sets the expiration time of a key to a Unix milliseconds timestamp.","O(1)","2.6.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,PEXPIREAT_History,1,PEXPIREAT_Tips,0,pexpireatCommand,-3,CMD_WRITE|CMD_FAST,ACL_CATEGORY_KEYSPACE,PEXPIREAT_Keyspecs,1,NULL,3),.args=PEXPIREAT_Args}, {MAKE_CMD("pexpiretime","Returns the expiration time of a key as a Unix milliseconds timestamp.","O(1)","7.0.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,PEXPIRETIME_History,0,PEXPIRETIME_Tips,0,pexpiretimeCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_KEYSPACE,PEXPIRETIME_Keyspecs,1,NULL,1),.args=PEXPIRETIME_Args}, {MAKE_CMD("pttl","Returns the expiration time in milliseconds of a key.","O(1)","2.6.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,PTTL_History,1,PTTL_Tips,1,pttlCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_KEYSPACE,PTTL_Keyspecs,1,NULL,1),.args=PTTL_Args}, -{MAKE_CMD("randomkey","Returns a random key name from the database.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,RANDOMKEY_History,0,RANDOMKEY_Tips,2,randomkeyCommand,1,CMD_READONLY|CMD_TOUCHES_ARBITRARY_KEYS,ACL_CATEGORY_KEYSPACE,RANDOMKEY_Keyspecs,0,NULL,0)}, +{MAKE_CMD("randomkey","Returns a random key name from the database.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,RANDOMKEY_History,0,RANDOMKEY_Tips,3,randomkeyCommand,1,CMD_READONLY|CMD_TOUCHES_ARBITRARY_KEYS,ACL_CATEGORY_KEYSPACE,RANDOMKEY_Keyspecs,0,NULL,0)}, {MAKE_CMD("rename","Renames a key and overwrites the destination.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,RENAME_History,0,RENAME_Tips,0,renameCommand,3,CMD_WRITE,ACL_CATEGORY_KEYSPACE,RENAME_Keyspecs,2,NULL,2),.args=RENAME_Args}, 
{MAKE_CMD("renamenx","Renames a key only when the target key name doesn't exist.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,RENAMENX_History,1,RENAMENX_Tips,0,renamenxCommand,3,CMD_WRITE|CMD_FAST,ACL_CATEGORY_KEYSPACE,RENAMENX_Keyspecs,2,NULL,2),.args=RENAMENX_Args}, {MAKE_CMD("restore","Creates a key from the serialized representation of a value.","O(1) to create the new key and additional O(N*M) to reconstruct the serialized value, where N is the number of Redis objects composing the value and M their average size. For small string values the time complexity is thus O(1)+O(1*M) where M is small, so simply O(1). However for sorted set values the complexity is O(N*M*log(N)) because inserting values into sorted sets is O(log(N)).","2.6.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,RESTORE_History,3,RESTORE_Tips,0,restoreCommand,-4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_KEYSPACE|ACL_CATEGORY_DANGEROUS,RESTORE_Keyspecs,1,NULL,7),.args=RESTORE_Args}, -{MAKE_CMD("scan","Iterates over the key names in the database.","O(1) for every call. O(N) for a complete iteration, including enough command calls for the cursor to return back to 0. N is the number of elements inside the collection.","2.8.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,SCAN_History,1,SCAN_Tips,2,scanCommand,-2,CMD_READONLY|CMD_TOUCHES_ARBITRARY_KEYS,ACL_CATEGORY_KEYSPACE,SCAN_Keyspecs,0,NULL,4),.args=SCAN_Args}, +{MAKE_CMD("scan","Iterates over the key names in the database.","O(1) for every call. O(N) for a complete iteration, including enough command calls for the cursor to return back to 0. 
N is the number of elements inside the collection.","2.8.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,SCAN_History,1,SCAN_Tips,3,scanCommand,-2,CMD_READONLY|CMD_TOUCHES_ARBITRARY_KEYS,ACL_CATEGORY_KEYSPACE,SCAN_Keyspecs,0,NULL,4),.args=SCAN_Args}, {MAKE_CMD("sort","Sorts the elements in a list, a set, or a sorted set, optionally storing the result.","O(N+M*log(M)) where N is the number of elements in the list or set to sort, and M the number of returned elements. When the elements are not sorted, complexity is O(N).","1.0.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,SORT_History,0,SORT_Tips,0,sortCommand,-2,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_SET|ACL_CATEGORY_SORTEDSET|ACL_CATEGORY_LIST|ACL_CATEGORY_DANGEROUS,SORT_Keyspecs,3,sortGetKeys,7),.args=SORT_Args}, {MAKE_CMD("sort_ro","Returns the sorted elements of a list, a set, or a sorted set.","O(N+M*log(M)) where N is the number of elements in the list or set to sort, and M the number of returned elements. When the elements are not sorted, complexity is O(N).","7.0.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,SORT_RO_History,0,SORT_RO_Tips,0,sortroCommand,-2,CMD_READONLY,ACL_CATEGORY_SET|ACL_CATEGORY_SORTEDSET|ACL_CATEGORY_LIST|ACL_CATEGORY_DANGEROUS,SORT_RO_Keyspecs,2,sortROGetKeys,6),.args=SORT_RO_Args}, {MAKE_CMD("touch","Returns the number of existing keys out of those specified after updating the time they were last accessed.","O(N) where N is the number of keys that will be touched.","3.2.1",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,TOUCH_History,0,TOUCH_Tips,2,touchCommand,-2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_KEYSPACE,TOUCH_Keyspecs,1,NULL,1),.args=TOUCH_Args}, {MAKE_CMD("ttl","Returns the expiration time in seconds of a key.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,TTL_History,1,TTL_Tips,1,ttlCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_KEYSPACE,TTL_Keyspecs,1,NULL,1),.args=TTL_Args}, {MAKE_CMD("type","Determines the type of 
value stored at a key.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,TYPE_History,0,TYPE_Tips,0,typeCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_KEYSPACE,TYPE_Keyspecs,1,NULL,1),.args=TYPE_Args}, {MAKE_CMD("unlink","Asynchronously deletes one or more keys.","O(1) for each key removed regardless of its size. Then the command does O(N) work in a different thread in order to reclaim memory, where N is the number of allocations the deleted objects where composed of.","4.0.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,UNLINK_History,0,UNLINK_Tips,2,unlinkCommand,-2,CMD_WRITE|CMD_FAST,ACL_CATEGORY_KEYSPACE,UNLINK_Keyspecs,1,NULL,1),.args=UNLINK_Args}, -{MAKE_CMD("wait","Blocks until the asynchronous replication of all preceding write commands sent by the connection is completed.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,WAIT_History,0,WAIT_Tips,2,waitCommand,3,0,ACL_CATEGORY_CONNECTION,WAIT_Keyspecs,0,NULL,2),.args=WAIT_Args}, -{MAKE_CMD("waitaof","Blocks until all of the preceding write commands sent by the connection are written to the append-only file of the master and/or replicas.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,WAITAOF_History,0,WAITAOF_Tips,2,waitaofCommand,4,CMD_NOSCRIPT,ACL_CATEGORY_CONNECTION,WAITAOF_Keyspecs,0,NULL,3),.args=WAITAOF_Args}, +{MAKE_CMD("wait","Blocks until the asynchronous replication of all preceding write commands sent by the connection is completed.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,WAIT_History,0,WAIT_Tips,2,waitCommand,3,CMD_BLOCKING,ACL_CATEGORY_CONNECTION,WAIT_Keyspecs,0,NULL,2),.args=WAIT_Args}, +{MAKE_CMD("waitaof","Blocks until all of the preceding write commands sent by the connection are written to the append-only file of the master and/or 
replicas.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"generic",COMMAND_GROUP_GENERIC,WAITAOF_History,0,WAITAOF_Tips,2,waitaofCommand,4,CMD_BLOCKING,ACL_CATEGORY_CONNECTION,WAITAOF_Keyspecs,0,NULL,3),.args=WAITAOF_Args}, /* geo */ {MAKE_CMD("geoadd","Adds one or more members to a geospatial index. The key is created if it doesn't exist.","O(log(N)) for each item added, where N is the number of elements in the sorted set.","3.2.0",CMD_DOC_NONE,NULL,NULL,"geo",COMMAND_GROUP_GEO,GEOADD_History,1,GEOADD_Tips,0,geoaddCommand,-5,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_GEO,GEOADD_Keyspecs,1,NULL,4),.args=GEOADD_Args}, -{MAKE_CMD("geodist","Returns the distance between two members of a geospatial index.","O(log(N))","3.2.0",CMD_DOC_NONE,NULL,NULL,"geo",COMMAND_GROUP_GEO,GEODIST_History,0,GEODIST_Tips,0,geodistCommand,-4,CMD_READONLY,ACL_CATEGORY_GEO,GEODIST_Keyspecs,1,NULL,4),.args=GEODIST_Args}, -{MAKE_CMD("geohash","Returns members from a geospatial index as geohash strings.","O(log(N)) for each member requested, where N is the number of elements in the sorted set.","3.2.0",CMD_DOC_NONE,NULL,NULL,"geo",COMMAND_GROUP_GEO,GEOHASH_History,0,GEOHASH_Tips,0,geohashCommand,-2,CMD_READONLY,ACL_CATEGORY_GEO,GEOHASH_Keyspecs,1,NULL,2),.args=GEOHASH_Args}, -{MAKE_CMD("geopos","Returns the longitude and latitude of members from a geospatial index.","O(N) where N is the number of members requested.","3.2.0",CMD_DOC_NONE,NULL,NULL,"geo",COMMAND_GROUP_GEO,GEOPOS_History,0,GEOPOS_Tips,0,geoposCommand,-2,CMD_READONLY,ACL_CATEGORY_GEO,GEOPOS_Keyspecs,1,NULL,2),.args=GEOPOS_Args}, +{MAKE_CMD("geodist","Returns the distance between two members of a geospatial index.","O(1)","3.2.0",CMD_DOC_NONE,NULL,NULL,"geo",COMMAND_GROUP_GEO,GEODIST_History,0,GEODIST_Tips,0,geodistCommand,-4,CMD_READONLY,ACL_CATEGORY_GEO,GEODIST_Keyspecs,1,NULL,4),.args=GEODIST_Args}, +{MAKE_CMD("geohash","Returns members from a geospatial index as geohash strings.","O(1) for each member 
requested.","3.2.0",CMD_DOC_NONE,NULL,NULL,"geo",COMMAND_GROUP_GEO,GEOHASH_History,0,GEOHASH_Tips,0,geohashCommand,-2,CMD_READONLY,ACL_CATEGORY_GEO,GEOHASH_Keyspecs,1,NULL,2),.args=GEOHASH_Args}, +{MAKE_CMD("geopos","Returns the longitude and latitude of members from a geospatial index.","O(1) for each member requested.","3.2.0",CMD_DOC_NONE,NULL,NULL,"geo",COMMAND_GROUP_GEO,GEOPOS_History,0,GEOPOS_Tips,0,geoposCommand,-2,CMD_READONLY,ACL_CATEGORY_GEO,GEOPOS_Keyspecs,1,NULL,2),.args=GEOPOS_Args}, {MAKE_CMD("georadius","Queries a geospatial index for members within a distance from a coordinate, optionally stores the result.","O(N+log(M)) where N is the number of elements inside the bounding box of the circular area delimited by center and radius and M is the number of items inside the index.","3.2.0",CMD_DOC_DEPRECATED,"`GEOSEARCH` and `GEOSEARCHSTORE` with the `BYRADIUS` argument","6.2.0","geo",COMMAND_GROUP_GEO,GEORADIUS_History,2,GEORADIUS_Tips,0,georadiusCommand,-6,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_GEO,GEORADIUS_Keyspecs,3,georadiusGetKeys,11),.args=GEORADIUS_Args}, -{MAKE_CMD("georadiusbymember","Queries a geospatial index for members within a distance from a member, optionally stores the result.","O(N+log(M)) where N is the number of elements inside the bounding box of the circular area delimited by center and radius and M is the number of items inside the index.","3.2.0",CMD_DOC_DEPRECATED,"`GEOSEARCH` and `GEOSEARCHSTORE` with the `BYRADIUS` and `FROMMEMBER` arguments","6.2.0","geo",COMMAND_GROUP_GEO,GEORADIUSBYMEMBER_History,1,GEORADIUSBYMEMBER_Tips,0,georadiusbymemberCommand,-5,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_GEO,GEORADIUSBYMEMBER_Keyspecs,3,georadiusGetKeys,10),.args=GEORADIUSBYMEMBER_Args}, -{MAKE_CMD("georadiusbymember_ro","Returns members from a geospatial index that are within a distance from a member.","O(N+log(M)) where N is the number of elements inside the bounding box of the circular area delimited by center and radius and M is the number of 
items inside the index.","3.2.10",CMD_DOC_DEPRECATED,"`GEOSEARCH` with the `BYRADIUS` and `FROMMEMBER` arguments","6.2.0","geo",COMMAND_GROUP_GEO,GEORADIUSBYMEMBER_RO_History,0,GEORADIUSBYMEMBER_RO_Tips,0,georadiusbymemberroCommand,-5,CMD_READONLY,ACL_CATEGORY_GEO,GEORADIUSBYMEMBER_RO_Keyspecs,1,NULL,9),.args=GEORADIUSBYMEMBER_RO_Args}, -{MAKE_CMD("georadius_ro","Returns members from a geospatial index that are within a distance from a coordinate.","O(N+log(M)) where N is the number of elements inside the bounding box of the circular area delimited by center and radius and M is the number of items inside the index.","3.2.10",CMD_DOC_DEPRECATED,"`GEOSEARCH` with the `BYRADIUS` argument","6.2.0","geo",COMMAND_GROUP_GEO,GEORADIUS_RO_History,1,GEORADIUS_RO_Tips,0,georadiusroCommand,-6,CMD_READONLY,ACL_CATEGORY_GEO,GEORADIUS_RO_Keyspecs,1,NULL,10),.args=GEORADIUS_RO_Args}, +{MAKE_CMD("georadiusbymember","Queries a geospatial index for members within a distance from a member, optionally stores the result.","O(N+log(M)) where N is the number of elements inside the bounding box of the circular area delimited by center and radius and M is the number of items inside the index.","3.2.0",CMD_DOC_DEPRECATED,"`GEOSEARCH` and `GEOSEARCHSTORE` with the `BYRADIUS` and `FROMMEMBER` arguments","6.2.0","geo",COMMAND_GROUP_GEO,GEORADIUSBYMEMBER_History,2,GEORADIUSBYMEMBER_Tips,0,georadiusbymemberCommand,-5,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_GEO,GEORADIUSBYMEMBER_Keyspecs,3,georadiusGetKeys,10),.args=GEORADIUSBYMEMBER_Args}, +{MAKE_CMD("georadiusbymember_ro","Returns members from a geospatial index that are within a distance from a member.","O(N+log(M)) where N is the number of elements inside the bounding box of the circular area delimited by center and radius and M is the number of items inside the index.","3.2.10",CMD_DOC_DEPRECATED,"`GEOSEARCH` with the `BYRADIUS` and `FROMMEMBER` 
arguments","6.2.0","geo",COMMAND_GROUP_GEO,GEORADIUSBYMEMBER_RO_History,2,GEORADIUSBYMEMBER_RO_Tips,0,georadiusbymemberroCommand,-5,CMD_READONLY,ACL_CATEGORY_GEO,GEORADIUSBYMEMBER_RO_Keyspecs,1,NULL,9),.args=GEORADIUSBYMEMBER_RO_Args}, +{MAKE_CMD("georadius_ro","Returns members from a geospatial index that are within a distance from a coordinate.","O(N+log(M)) where N is the number of elements inside the bounding box of the circular area delimited by center and radius and M is the number of items inside the index.","3.2.10",CMD_DOC_DEPRECATED,"`GEOSEARCH` with the `BYRADIUS` argument","6.2.0","geo",COMMAND_GROUP_GEO,GEORADIUS_RO_History,2,GEORADIUS_RO_Tips,0,georadiusroCommand,-6,CMD_READONLY,ACL_CATEGORY_GEO,GEORADIUS_RO_Keyspecs,1,NULL,10),.args=GEORADIUS_RO_Args}, {MAKE_CMD("geosearch","Queries a geospatial index for members inside an area of a box or a circle.","O(N+log(M)) where N is the number of elements in the grid-aligned bounding box area around the shape provided as the filter and M is the number of items inside the shape","6.2.0",CMD_DOC_NONE,NULL,NULL,"geo",COMMAND_GROUP_GEO,GEOSEARCH_History,1,GEOSEARCH_Tips,0,geosearchCommand,-7,CMD_READONLY,ACL_CATEGORY_GEO,GEOSEARCH_Keyspecs,1,NULL,8),.args=GEOSEARCH_Args}, {MAKE_CMD("geosearchstore","Queries a geospatial index for members inside an area of a box or a circle, optionally stores the result.","O(N+log(M)) where N is the number of elements in the grid-aligned bounding box area around the shape provided as the filter and M is the number of items inside the shape","6.2.0",CMD_DOC_NONE,NULL,NULL,"geo",COMMAND_GROUP_GEO,GEOSEARCHSTORE_History,1,GEOSEARCHSTORE_Tips,0,geosearchstoreCommand,-8,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_GEO,GEOSEARCHSTORE_Keyspecs,2,NULL,7),.args=GEOSEARCHSTORE_Args}, /* hash */ {MAKE_CMD("hdel","Deletes one or more fields and their values from a hash. 
Deletes the hash if no fields remain.","O(N) where N is the number of fields to be removed.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HDEL_History,1,HDEL_Tips,0,hdelCommand,-3,CMD_WRITE|CMD_FAST,ACL_CATEGORY_HASH,HDEL_Keyspecs,1,NULL,2),.args=HDEL_Args}, {MAKE_CMD("hexists","Determines whether a field exists in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXISTS_History,0,HEXISTS_Tips,0,hexistsCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HEXISTS_Keyspecs,1,NULL,2),.args=HEXISTS_Args}, +{MAKE_CMD("hexpire","Set expiry for hash field using relative time to expire (seconds)","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXPIRE_History,0,HEXPIRE_Tips,0,hexpireCommand,-6,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HEXPIRE_Keyspecs,1,NULL,4),.args=HEXPIRE_Args}, +{MAKE_CMD("hexpireat","Set expiry for hash field using an absolute Unix timestamp (seconds)","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXPIREAT_History,0,HEXPIREAT_Tips,0,hexpireatCommand,-6,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HEXPIREAT_Keyspecs,1,NULL,4),.args=HEXPIREAT_Args}, +{MAKE_CMD("hexpiretime","Returns the expiration time of a hash field as a Unix timestamp, in seconds.","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXPIRETIME_History,0,HEXPIRETIME_Tips,0,hexpiretimeCommand,-5,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HEXPIRETIME_Keyspecs,1,NULL,2),.args=HEXPIRETIME_Args}, {MAKE_CMD("hget","Returns the value of a field in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGET_History,0,HGET_Tips,0,hgetCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HGET_Keyspecs,1,NULL,2),.args=HGET_Args}, {MAKE_CMD("hgetall","Returns all fields and values in a hash.","O(N) where N is the size of the 
hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGETALL_History,0,HGETALL_Tips,1,hgetallCommand,2,CMD_READONLY,ACL_CATEGORY_HASH,HGETALL_Keyspecs,1,NULL,1),.args=HGETALL_Args}, {MAKE_CMD("hincrby","Increments the integer value of a field in a hash by a number. Uses 0 as initial value if the field doesn't exist.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HINCRBY_History,0,HINCRBY_Tips,0,hincrbyCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HINCRBY_Keyspecs,1,NULL,3),.args=HINCRBY_Args}, @@ -10656,11 +11040,17 @@ struct COMMAND_STRUCT redisCommandTable[] = { {MAKE_CMD("hlen","Returns the number of fields in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HLEN_History,0,HLEN_Tips,0,hlenCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HLEN_Keyspecs,1,NULL,1),.args=HLEN_Args}, {MAKE_CMD("hmget","Returns the values of all fields in a hash.","O(N) where N is the number of fields being requested.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HMGET_History,0,HMGET_Tips,0,hmgetCommand,-3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HMGET_Keyspecs,1,NULL,2),.args=HMGET_Args}, {MAKE_CMD("hmset","Sets the values of multiple fields.","O(N) where N is the number of fields being set.","2.0.0",CMD_DOC_DEPRECATED,"`HSET` with multiple field-value pairs","4.0.0","hash",COMMAND_GROUP_HASH,HMSET_History,0,HMSET_Tips,0,hsetCommand,-4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HMSET_Keyspecs,1,NULL,2),.args=HMSET_Args}, +{MAKE_CMD("hpersist","Removes the expiration time for each specified field","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPERSIST_History,0,HPERSIST_Tips,0,hpersistCommand,-5,CMD_WRITE|CMD_FAST,ACL_CATEGORY_HASH,HPERSIST_Keyspecs,1,NULL,2),.args=HPERSIST_Args}, +{MAKE_CMD("hpexpire","Set expiry for hash field using relative time to expire (milliseconds)","O(N) where N is the number of specified 
fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPEXPIRE_History,0,HPEXPIRE_Tips,0,hpexpireCommand,-6,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HPEXPIRE_Keyspecs,1,NULL,4),.args=HPEXPIRE_Args}, +{MAKE_CMD("hpexpireat","Set expiry for hash field using an absolute Unix timestamp (milliseconds)","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPEXPIREAT_History,0,HPEXPIREAT_Tips,0,hpexpireatCommand,-6,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HPEXPIREAT_Keyspecs,1,NULL,4),.args=HPEXPIREAT_Args}, +{MAKE_CMD("hpexpiretime","Returns the expiration time of a hash field as a Unix timestamp, in msec.","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPEXPIRETIME_History,0,HPEXPIRETIME_Tips,0,hpexpiretimeCommand,-5,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HPEXPIRETIME_Keyspecs,1,NULL,2),.args=HPEXPIRETIME_Args}, +{MAKE_CMD("hpttl","Returns the TTL in milliseconds of a hash field.","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPTTL_History,0,HPTTL_Tips,0,hpttlCommand,-5,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HPTTL_Keyspecs,1,NULL,2),.args=HPTTL_Args}, {MAKE_CMD("hrandfield","Returns one or more random fields from a hash.","O(N) where N is the number of fields returned","6.2.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HRANDFIELD_History,0,HRANDFIELD_Tips,1,hrandfieldCommand,-2,CMD_READONLY,ACL_CATEGORY_HASH,HRANDFIELD_Keyspecs,1,NULL,2),.args=HRANDFIELD_Args}, -{MAKE_CMD("hscan","Iterates over fields and values of a hash.","O(1) for every call. O(N) for a complete iteration, including enough command calls for the cursor to return back to 0. 
N is the number of elements inside the collection.","2.8.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSCAN_History,0,HSCAN_Tips,1,hscanCommand,-3,CMD_READONLY,ACL_CATEGORY_HASH,HSCAN_Keyspecs,1,NULL,4),.args=HSCAN_Args}, +{MAKE_CMD("hscan","Iterates over fields and values of a hash.","O(1) for every call. O(N) for a complete iteration, including enough command calls for the cursor to return back to 0. N is the number of elements inside the collection.","2.8.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSCAN_History,0,HSCAN_Tips,1,hscanCommand,-3,CMD_READONLY,ACL_CATEGORY_HASH,HSCAN_Keyspecs,1,NULL,5),.args=HSCAN_Args}, {MAKE_CMD("hset","Creates or modifies the value of a field in a hash.","O(1) for each field/value pair added, so O(N) to add N field/value pairs when the command is called with multiple field/value pairs.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSET_History,1,HSET_Tips,0,hsetCommand,-4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HSET_Keyspecs,1,NULL,2),.args=HSET_Args}, {MAKE_CMD("hsetnx","Sets the value of a field in a hash only when the field doesn't exist.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSETNX_History,0,HSETNX_Tips,0,hsetnxCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HSETNX_Keyspecs,1,NULL,3),.args=HSETNX_Args}, {MAKE_CMD("hstrlen","Returns the length of the value of a field.","O(1)","3.2.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSTRLEN_History,0,HSTRLEN_Tips,0,hstrlenCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HSTRLEN_Keyspecs,1,NULL,2),.args=HSTRLEN_Args}, +{MAKE_CMD("httl","Returns the TTL in seconds of a hash field.","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HTTL_History,0,HTTL_Tips,0,httlCommand,-5,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HTTL_Keyspecs,1,NULL,2),.args=HTTL_Args}, {MAKE_CMD("hvals","Returns all values in a hash.","O(N) where N is the size of the 
hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HVALS_History,0,HVALS_Tips,1,hvalsCommand,2,CMD_READONLY,ACL_CATEGORY_HASH,HVALS_Keyspecs,1,NULL,1),.args=HVALS_Args}, /* hyperloglog */ {MAKE_CMD("pfadd","Adds elements to a HyperLogLog key. Creates the key if it doesn't exist.","O(1) to add every element.","2.8.9",CMD_DOC_NONE,NULL,NULL,"hyperloglog",COMMAND_GROUP_HYPERLOGLOG,PFADD_History,0,PFADD_Tips,0,pfaddCommand,-2,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HYPERLOGLOG,PFADD_Keyspecs,1,NULL,2),.args=PFADD_Args}, @@ -10732,12 +11122,12 @@ struct COMMAND_STRUCT redisCommandTable[] = { {MAKE_CMD("monitor","Listens for all requests received by the server in real-time.",NULL,"1.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MONITOR_History,0,MONITOR_Tips,0,monitorCommand,1,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,0,MONITOR_Keyspecs,0,NULL,0)}, {MAKE_CMD("psync","An internal command used in replication.",NULL,"2.8.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,PSYNC_History,0,PSYNC_Tips,0,syncCommand,-3,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NO_MULTI|CMD_NOSCRIPT,0,PSYNC_Keyspecs,0,NULL,2),.args=PSYNC_Args}, {MAKE_CMD("replconf","An internal command for configuring the replication stream.","O(1)","3.0.0",CMD_DOC_SYSCMD,NULL,NULL,"server",COMMAND_GROUP_SERVER,REPLCONF_History,0,REPLCONF_Tips,0,replconfCommand,-1,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_ALLOW_BUSY,0,REPLCONF_Keyspecs,0,NULL,0)}, -{MAKE_CMD("replicaof","Configures a server as replica of another, or promotes it to a master.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,REPLICAOF_History,0,REPLICAOF_Tips,0,replicaofCommand,3,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NOSCRIPT|CMD_STALE,0,REPLICAOF_Keyspecs,0,NULL,2),.args=REPLICAOF_Args}, +{MAKE_CMD("replicaof","Configures a server as replica of another, or promotes it to a 
master.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,REPLICAOF_History,0,REPLICAOF_Tips,0,replicaofCommand,3,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NOSCRIPT|CMD_STALE,0,REPLICAOF_Keyspecs,0,NULL,1),.args=REPLICAOF_Args}, {MAKE_CMD("restore-asking","An internal command for migrating keys in a cluster.","O(1) to create the new key and additional O(N*M) to reconstruct the serialized value, where N is the number of Redis objects composing the value and M their average size. For small string values the time complexity is thus O(1)+O(1*M) where M is small, so simply O(1). However for sorted set values the complexity is O(N*M*log(N)) because inserting values into sorted sets is O(log(N)).","3.0.0",CMD_DOC_SYSCMD,NULL,NULL,"server",COMMAND_GROUP_SERVER,RESTORE_ASKING_History,3,RESTORE_ASKING_Tips,0,restoreCommand,-4,CMD_WRITE|CMD_DENYOOM|CMD_ASKING,ACL_CATEGORY_KEYSPACE|ACL_CATEGORY_DANGEROUS,RESTORE_ASKING_Keyspecs,1,NULL,7),.args=RESTORE_ASKING_Args}, {MAKE_CMD("role","Returns the replication role.","O(1)","2.8.12",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,ROLE_History,0,ROLE_Tips,0,roleCommand,1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_SENTINEL,ACL_CATEGORY_ADMIN|ACL_CATEGORY_DANGEROUS,ROLE_Keyspecs,0,NULL,0)}, {MAKE_CMD("save","Synchronously saves the database(s) to disk.","O(N) where N is the total number of keys in all databases","1.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SAVE_History,0,SAVE_Tips,0,saveCommand,1,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NOSCRIPT|CMD_NO_MULTI,0,SAVE_Keyspecs,0,NULL,0)}, {MAKE_CMD("shutdown","Synchronously saves the database(s) to disk and shuts down the Redis server.","O(N) when saving, where N is the total number of keys in all databases when saving data, otherwise 
O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SHUTDOWN_History,1,SHUTDOWN_Tips,0,shutdownCommand,-1,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_NO_MULTI|CMD_SENTINEL|CMD_ALLOW_BUSY,0,SHUTDOWN_Keyspecs,0,NULL,4),.args=SHUTDOWN_Args}, -{MAKE_CMD("slaveof","Sets a Redis server as a replica of another, or promotes it to being a master.","O(1)","1.0.0",CMD_DOC_DEPRECATED,"`REPLICAOF`","5.0.0","server",COMMAND_GROUP_SERVER,SLAVEOF_History,0,SLAVEOF_Tips,0,replicaofCommand,3,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NOSCRIPT|CMD_STALE,0,SLAVEOF_Keyspecs,0,NULL,2),.args=SLAVEOF_Args}, +{MAKE_CMD("slaveof","Sets a Redis server as a replica of another, or promotes it to being a master.","O(1)","1.0.0",CMD_DOC_DEPRECATED,"`REPLICAOF`","5.0.0","server",COMMAND_GROUP_SERVER,SLAVEOF_History,0,SLAVEOF_Tips,0,replicaofCommand,3,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NOSCRIPT|CMD_STALE,0,SLAVEOF_Keyspecs,0,NULL,1),.args=SLAVEOF_Args}, {MAKE_CMD("slowlog","A container for slow log commands.","Depends on subcommand.","2.2.12",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SLOWLOG_History,0,SLOWLOG_Tips,0,NULL,-2,0,0,SLOWLOG_Keyspecs,0,NULL,0),.subcommands=SLOWLOG_Subcommands}, {MAKE_CMD("swapdb","Swaps two Redis databases.","O(N) where N is the count of clients watching or blocking on keys from both databases.","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SWAPDB_History,0,SWAPDB_Tips,0,swapdbCommand,3,CMD_WRITE|CMD_FAST,ACL_CATEGORY_KEYSPACE|ACL_CATEGORY_DANGEROUS,SWAPDB_Keyspecs,0,NULL,2),.args=SWAPDB_Args}, {MAKE_CMD("sync","An internal command used in replication.",NULL,"1.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,SYNC_History,0,SYNC_Tips,0,syncCommand,1,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_NO_MULTI|CMD_NOSCRIPT,0,SYNC_Keyspecs,0,NULL,0)}, @@ -10807,7 +11197,7 @@ struct COMMAND_STRUCT redisCommandTable[] = { {MAKE_CMD("xlen","Return the number of messages in a 
stream.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"stream",COMMAND_GROUP_STREAM,XLEN_History,0,XLEN_Tips,0,xlenCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_STREAM,XLEN_Keyspecs,1,NULL,1),.args=XLEN_Args}, {MAKE_CMD("xpending","Returns the information and entries from a stream consumer group's pending entries list.","O(N) with N being the number of elements returned, so asking for a small fixed number of entries per call is O(1). O(M), where M is the total number of entries scanned when used with the IDLE filter. When the command returns just the summary and the list of consumers is small, it runs in O(1) time; otherwise, an additional O(N) time for iterating every consumer.","5.0.0",CMD_DOC_NONE,NULL,NULL,"stream",COMMAND_GROUP_STREAM,XPENDING_History,1,XPENDING_Tips,1,xpendingCommand,-3,CMD_READONLY,ACL_CATEGORY_STREAM,XPENDING_Keyspecs,1,NULL,3),.args=XPENDING_Args}, {MAKE_CMD("xrange","Returns the messages from a stream within a range of IDs.","O(N) with N being the number of elements being returned. If N is constant (e.g. always asking for the first 10 elements with COUNT), you can consider it O(1).","5.0.0",CMD_DOC_NONE,NULL,NULL,"stream",COMMAND_GROUP_STREAM,XRANGE_History,1,XRANGE_Tips,0,xrangeCommand,-4,CMD_READONLY,ACL_CATEGORY_STREAM,XRANGE_Keyspecs,1,NULL,4),.args=XRANGE_Args}, -{MAKE_CMD("xread","Returns messages from multiple streams with IDs greater than the ones requested. Blocks until a message is available otherwise.",NULL,"5.0.0",CMD_DOC_NONE,NULL,NULL,"stream",COMMAND_GROUP_STREAM,XREAD_History,0,XREAD_Tips,0,xreadCommand,-4,CMD_BLOCKING|CMD_READONLY|CMD_BLOCKING,ACL_CATEGORY_STREAM,XREAD_Keyspecs,1,xreadGetKeys,3),.args=XREAD_Args}, +{MAKE_CMD("xread","Returns messages from multiple streams with IDs greater than the ones requested. 
Blocks until a message is available otherwise.",NULL,"5.0.0",CMD_DOC_NONE,NULL,NULL,"stream",COMMAND_GROUP_STREAM,XREAD_History,0,XREAD_Tips,0,xreadCommand,-4,CMD_BLOCKING|CMD_READONLY,ACL_CATEGORY_STREAM,XREAD_Keyspecs,1,xreadGetKeys,3),.args=XREAD_Args}, {MAKE_CMD("xreadgroup","Returns new or historical messages from a stream for a consumer in a group. Blocks until a message is available otherwise.","For each stream mentioned: O(M) with M being the number of elements returned. If M is constant (e.g. always asking for the first 10 elements with COUNT), you can consider it O(1). On the other side when XREADGROUP blocks, XADD will pay the O(N) time in order to serve the N clients blocked on the stream getting new data.","5.0.0",CMD_DOC_NONE,NULL,NULL,"stream",COMMAND_GROUP_STREAM,XREADGROUP_History,0,XREADGROUP_Tips,0,xreadCommand,-7,CMD_BLOCKING|CMD_WRITE,ACL_CATEGORY_STREAM,XREADGROUP_Keyspecs,1,xreadGetKeys,5),.args=XREADGROUP_Args}, {MAKE_CMD("xrevrange","Returns the messages from a stream within a range of IDs in reverse order.","O(N) with N being the number of elements returned. If N is constant (e.g. 
always asking for the first 10 elements with COUNT), you can consider it O(1).","5.0.0",CMD_DOC_NONE,NULL,NULL,"stream",COMMAND_GROUP_STREAM,XREVRANGE_History,1,XREVRANGE_Tips,0,xrevrangeCommand,-4,CMD_READONLY,ACL_CATEGORY_STREAM,XREVRANGE_Keyspecs,1,NULL,4),.args=XREVRANGE_Args}, {MAKE_CMD("xsetid","An internal command for replicating stream values.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"stream",COMMAND_GROUP_STREAM,XSETID_History,1,XSETID_Tips,0,xsetidCommand,-3,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_STREAM,XSETID_Keyspecs,1,NULL,4),.args=XSETID_Args}, @@ -10827,7 +11217,7 @@ struct COMMAND_STRUCT redisCommandTable[] = { {MAKE_CMD("lcs","Finds the longest common substring.","O(N*M) where N and M are the lengths of s1 and s2, respectively","7.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,LCS_History,0,LCS_Tips,0,lcsCommand,-3,CMD_READONLY,ACL_CATEGORY_STRING,LCS_Keyspecs,1,NULL,6),.args=LCS_Args}, {MAKE_CMD("mget","Atomically returns the string values of one or more keys.","O(N) where N is the number of keys to retrieve.","1.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,MGET_History,0,MGET_Tips,1,mgetCommand,-2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_STRING,MGET_Keyspecs,1,NULL,1),.args=MGET_Args}, {MAKE_CMD("mset","Atomically creates or modifies the string values of one or more keys.","O(N) where N is the number of keys to set.","1.0.1",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,MSET_History,0,MSET_Tips,2,msetCommand,-3,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,MSET_Keyspecs,1,NULL,1),.args=MSET_Args}, -{MAKE_CMD("msetnx","Atomically modifies the string values of one or more keys only when all keys don't exist.","O(N) where N is the number of keys to set.","1.0.1",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,MSETNX_History,0,MSETNX_Tips,2,msetnxCommand,-3,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,MSETNX_Keyspecs,1,NULL,1),.args=MSETNX_Args}, +{MAKE_CMD("msetnx","Atomically modifies the string values of one or more keys 
only when all keys don't exist.","O(N) where N is the number of keys to set.","1.0.1",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,MSETNX_History,0,MSETNX_Tips,0,msetnxCommand,-3,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,MSETNX_Keyspecs,1,NULL,1),.args=MSETNX_Args}, {MAKE_CMD("psetex","Sets both string value and expiration time in milliseconds of a key. The key is created if it doesn't exist.","O(1)","2.6.0",CMD_DOC_DEPRECATED,"`SET` with the `PX` argument","2.6.12","string",COMMAND_GROUP_STRING,PSETEX_History,0,PSETEX_Tips,0,psetexCommand,4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,PSETEX_Keyspecs,1,NULL,3),.args=PSETEX_Args}, {MAKE_CMD("set","Sets the string value of a key, ignoring its type. The key is created if it doesn't exist.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,SET_History,4,SET_Tips,0,setCommand,-3,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,SET_Keyspecs,1,setGetKeys,5),.args=SET_Args}, {MAKE_CMD("setex","Sets the string value and expiration time of a key. Creates the key if it doesn't exist.","O(1)","2.0.0",CMD_DOC_DEPRECATED,"`SET` with the `EX` argument","2.6.12","string",COMMAND_GROUP_STRING,SETEX_History,0,SETEX_Tips,0,setexCommand,4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,SETEX_Keyspecs,1,NULL,3),.args=SETEX_Args}, diff --git a/src/commands.h b/src/commands.h index 52acacfe0b1..1eefab4812b 100644 --- a/src/commands.h +++ b/src/commands.h @@ -19,7 +19,7 @@ typedef enum { #define CMD_ARG_MULTIPLE (1<<1) #define CMD_ARG_MULTIPLE_TOKEN (1<<2) -/* WARNING! This struct must match RedisModuleCommandArg */ +/* Must be compatible with RedisModuleCommandArg. See moduleCopyCommandArgs. */ typedef struct redisCommandArg { const char *name; redisCommandArgType type; diff --git a/src/commands/README.md b/src/commands/README.md new file mode 100644 index 00000000000..5ef9a3f2905 --- /dev/null +++ b/src/commands/README.md @@ -0,0 +1,15 @@ +This directory contains JSON files, one for each of Redis commands. 
+ +Each JSON contains all the information about the command itself, but these JSON files are not to be used directly! +Any third party who needs access to command information must get it from `COMMAND INFO` and `COMMAND DOCS`. +The output can be extracted in a JSON format by using `redis-cli --json`, in the same manner as in `utils/generate-commands-json.py`. + +The JSON files are used to generate commands.def (and https://github.com/redis/redis-doc/blob/master/commands.json) in Redis, and +despite looking similar to the output of `COMMAND` there are some fields and flags that are implicitly populated, and that's the +reason one shouldn't rely on the raw files. + +The structure of each JSON is somewhat documented in https://redis.io/commands/command-docs/ and https://redis.io/commands/command/ + +The `reply_schema` section is a standard JSON Schema (see https://json-schema.org/) that describes the reply of each command. +It is designed to someday be used to auto-generate code in client libraries, but is not yet mature and is not exposed externally. 
+ diff --git a/src/commands/acl-deluser.json b/src/commands/acl-deluser.json index 4fc106503d4..80e8a7ad51b 100644 --- a/src/commands/acl-deluser.json +++ b/src/commands/acl-deluser.json @@ -14,6 +14,10 @@ "STALE", "SENTINEL" ], + "command_tips": [ + "REQUEST_POLICY:ALL_NODES", + "RESPONSE_POLICY:ALL_SUCCEEDED" + ], "reply_schema": { "type": "integer", "description": "The number of users that were deleted" diff --git a/src/commands/acl-save.json b/src/commands/acl-save.json index 0b2af21e650..98d8dfd34bf 100644 --- a/src/commands/acl-save.json +++ b/src/commands/acl-save.json @@ -14,6 +14,10 @@ "STALE", "SENTINEL" ], + "command_tips": [ + "REQUEST_POLICY:ALL_NODES", + "RESPONSE_POLICY:ALL_SUCCEEDED" + ], "reply_schema": { "const": "OK" } diff --git a/src/commands/acl-setuser.json b/src/commands/acl-setuser.json index e26df464fe5..1a909170f1b 100644 --- a/src/commands/acl-setuser.json +++ b/src/commands/acl-setuser.json @@ -24,6 +24,10 @@ "STALE", "SENTINEL" ], + "command_tips": [ + "REQUEST_POLICY:ALL_NODES", + "RESPONSE_POLICY:ALL_SUCCEEDED" + ], "reply_schema": { "const": "OK" }, diff --git a/src/commands/client-kill.json b/src/commands/client-kill.json index bd0262d4e99..17f848cdae3 100644 --- a/src/commands/client-kill.json +++ b/src/commands/client-kill.json @@ -27,6 +27,10 @@ [ "6.2.0", "`LADDR` option." + ], + [ + "7.4.0", + "`MAXAGE` option." 
] ], "command_flags": [ @@ -136,6 +140,13 @@ "token": "NO" } ] + }, + { + "token": "MAXAGE", + "name": "maxage", + "type": "integer", + "optional": true, + "since": "7.4.0" } ] } diff --git a/src/commands/client-setinfo.json b/src/commands/client-setinfo.json index e61ba56645d..d0d8f7318f9 100644 --- a/src/commands/client-setinfo.json +++ b/src/commands/client-setinfo.json @@ -13,6 +13,10 @@ "STALE", "SENTINEL" ], + "command_tips": [ + "REQUEST_POLICY:ALL_NODES", + "RESPONSE_POLICY:ALL_SUCCEEDED" + ], "acl_categories": [ "CONNECTION" ], diff --git a/src/commands/client-setname.json b/src/commands/client-setname.json index e8920b686bc..b071bd18ff9 100644 --- a/src/commands/client-setname.json +++ b/src/commands/client-setname.json @@ -13,6 +13,10 @@ "STALE", "SENTINEL" ], + "command_tips": [ + "REQUEST_POLICY:ALL_NODES", + "RESPONSE_POLICY:ALL_SUCCEEDED" + ], "acl_categories": [ "CONNECTION" ], diff --git a/src/commands/cluster-replicas.json b/src/commands/cluster-replicas.json index 49a9227705e..e01617feebf 100644 --- a/src/commands/cluster-replicas.json +++ b/src/commands/cluster-replicas.json @@ -1,7 +1,7 @@ { "REPLICAS": { "summary": "Lists the replica nodes of a master node.", - "complexity": "O(1)", + "complexity": "O(N) where N is the number of replicas.", "group": "cluster", "since": "5.0.0", "arity": 3, diff --git a/src/commands/cluster-shards.json b/src/commands/cluster-shards.json index dcaad3ea3eb..e7a08295347 100644 --- a/src/commands/cluster-shards.json +++ b/src/commands/cluster-shards.json @@ -26,7 +26,7 @@ "description": "an even number element array specifying the start and end slot numbers for slot ranges owned by this shard", "type": "array", "items": { - "type": "string" + "type": "integer" } }, "nodes": { diff --git a/src/commands/cluster-slaves.json b/src/commands/cluster-slaves.json index a2e6755a0a0..a736088e4c9 100644 --- a/src/commands/cluster-slaves.json +++ b/src/commands/cluster-slaves.json @@ -1,7 +1,7 @@ { "SLAVES": { "summary": 
"Lists the replica nodes of a master node.", - "complexity": "O(1)", + "complexity": "O(N) where N is the number of replicas.", "group": "cluster", "since": "3.0.0", "arity": 3, diff --git a/src/commands/config-resetstat.json b/src/commands/config-resetstat.json index 87a08972a5d..fd6701f0d9e 100644 --- a/src/commands/config-resetstat.json +++ b/src/commands/config-resetstat.json @@ -13,6 +13,10 @@ "LOADING", "STALE" ], + "command_tips": [ + "REQUEST_POLICY:ALL_NODES", + "RESPONSE_POLICY:ALL_SUCCEEDED" + ], "reply_schema": { "const": "OK" } diff --git a/src/commands/config-rewrite.json b/src/commands/config-rewrite.json index 490e2f8e870..af49dd770f8 100644 --- a/src/commands/config-rewrite.json +++ b/src/commands/config-rewrite.json @@ -13,6 +13,10 @@ "LOADING", "STALE" ], + "command_tips": [ + "REQUEST_POLICY:ALL_NODES", + "RESPONSE_POLICY:ALL_SUCCEEDED" + ], "reply_schema": { "const": "OK" } diff --git a/src/commands/geodist.json b/src/commands/geodist.json index 97969d332c7..145ca718a87 100644 --- a/src/commands/geodist.json +++ b/src/commands/geodist.json @@ -1,7 +1,7 @@ { "GEODIST": { "summary": "Returns the distance between two members of a geospatial index.", - "complexity": "O(log(N))", + "complexity": "O(1)", "group": "geo", "since": "3.2.0", "arity": -4, diff --git a/src/commands/geohash.json b/src/commands/geohash.json index 8f4d55a62b2..01402c4657a 100644 --- a/src/commands/geohash.json +++ b/src/commands/geohash.json @@ -1,7 +1,7 @@ { "GEOHASH": { "summary": "Returns members from a geospatial index as geohash strings.", - "complexity": "O(log(N)) for each member requested, where N is the number of elements in the sorted set.", + "complexity": "O(1) for each member requested.", "group": "geo", "since": "3.2.0", "arity": -2, diff --git a/src/commands/geopos.json b/src/commands/geopos.json index 5473c1b76fa..408b6e6a39f 100644 --- a/src/commands/geopos.json +++ b/src/commands/geopos.json @@ -1,7 +1,7 @@ { "GEOPOS": { "summary": "Returns the longitude and 
latitude of members from a geospatial index.", - "complexity": "O(N) where N is the number of members requested.", + "complexity": "O(1) for each member requested.", "group": "geo", "since": "3.2.0", "arity": -2, diff --git a/src/commands/georadius_ro.json b/src/commands/georadius_ro.json index 964246a2094..b3d335d4a1f 100644 --- a/src/commands/georadius_ro.json +++ b/src/commands/georadius_ro.json @@ -10,6 +10,10 @@ [ "6.2.0", "Added the `ANY` option for `COUNT`." + ], + [ + "7.0.0", + "Added support for uppercase unit names." ] ], "deprecated_since": "6.2.0", diff --git a/src/commands/georadiusbymember.json b/src/commands/georadiusbymember.json index 4b627419baa..6102a1b163e 100644 --- a/src/commands/georadiusbymember.json +++ b/src/commands/georadiusbymember.json @@ -8,6 +8,10 @@ "function": "georadiusbymemberCommand", "get_keys_function": "georadiusGetKeys", "history": [ + [ + "6.2.0", + "Added the `ANY` option for `COUNT`." + ], [ "7.0.0", "Added support for uppercase unit names." diff --git a/src/commands/georadiusbymember_ro.json b/src/commands/georadiusbymember_ro.json index 59258819171..0cc599feff9 100644 --- a/src/commands/georadiusbymember_ro.json +++ b/src/commands/georadiusbymember_ro.json @@ -6,6 +6,16 @@ "since": "3.2.10", "arity": -5, "function": "georadiusbymemberroCommand", + "history": [ + [ + "6.2.0", + "Added the `ANY` option for `COUNT`." + ], + [ + "7.0.0", + "Added support for uppercase unit names." 
+ ] + ], "deprecated_since": "6.2.0", "replaced_by": "`GEOSEARCH` with the `BYRADIUS` and `FROMMEMBER` arguments", "doc_flags": [ diff --git a/src/commands/hexpire.json b/src/commands/hexpire.json new file mode 100644 index 00000000000..832c182aea2 --- /dev/null +++ b/src/commands/hexpire.json @@ -0,0 +1,119 @@ +{ + "HEXPIRE": { + "summary": "Set expiry for hash field using relative time to expire (seconds)", + "complexity": "O(N) where N is the number of specified fields", + "group": "hash", + "since": "7.4.0", + "arity": -6, + "function": "hexpireCommand", + "history": [], + "command_flags": [ + "WRITE", + "DENYOOM", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + "RW", + "UPDATE" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "Array of results. Returns empty array if the key does not exist.", + "type": "array", + "minItems": 0, + "maxItems": 4294967295, + "items": { + "oneOf": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "Specified NX | XX | GT | LT condition not met", + "const": 0 + }, + { + "description": "Expiration time was set or updated.", + "const": 1 + }, + { + "description": "Field deleted because the specified expiration time is in the past.", + "const": 2 + } + ] + } + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "seconds", + "type": "integer" + }, + { + "name": "condition", + "type": "oneof", + "optional": true, + "arguments": [ + { + "name": "nx", + "type": "pure-token", + "token": "NX" + }, + { + "name": "xx", + "type": "pure-token", + "token": "XX" + }, + { + "name": "gt", + "type": "pure-token", + "token": "GT" + }, + { + "name": "lt", + "type": "pure-token", + "token": "LT" + } + ] + }, + { + "name": "fields", + "token": "FIELDS", + "type": "block", + "arguments": [ + { + 
"name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } + ] + } +} diff --git a/src/commands/hexpireat.json b/src/commands/hexpireat.json new file mode 100644 index 00000000000..4a7c0c71886 --- /dev/null +++ b/src/commands/hexpireat.json @@ -0,0 +1,119 @@ +{ + "HEXPIREAT": { + "summary": "Set expiry for hash field using an absolute Unix timestamp (seconds)", + "complexity": "O(N) where N is the number of specified fields", + "group": "hash", + "since": "7.4.0", + "arity": -6, + "function": "hexpireatCommand", + "history": [], + "command_flags": [ + "WRITE", + "DENYOOM", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + "RW", + "UPDATE" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "Array of results. Returns empty array if the key does not exist.", + "type": "array", + "minItems": 0, + "maxItems": 4294967295, + "items": { + "oneOf": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "Specified NX | XX | GT | LT condition not met", + "const": 0 + }, + { + "description": "Expiration time was set or updated.", + "const": 1 + }, + { + "description": "Field deleted because the specified expiration time is in the past.", + "const": 2 + } + ] + } + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "unix-time-seconds", + "type": "unix-time" + }, + { + "name": "condition", + "type": "oneof", + "optional": true, + "arguments": [ + { + "name": "nx", + "type": "pure-token", + "token": "NX" + }, + { + "name": "xx", + "type": "pure-token", + "token": "XX" + }, + { + "name": "gt", + "type": "pure-token", + "token": "GT" + }, + { + "name": "lt", + "type": "pure-token", + "token": "LT" + } + ] + }, + { + "name": "fields", + "token": "FIELDS", + "type": "block", 
+ "arguments": [ + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } + ] + } +} \ No newline at end of file diff --git a/src/commands/hexpiretime.json b/src/commands/hexpiretime.json new file mode 100644 index 00000000000..28c1e5f4baa --- /dev/null +++ b/src/commands/hexpiretime.json @@ -0,0 +1,84 @@ +{ + "HEXPIRETIME": { + "summary": "Returns the expiration time of a hash field as a Unix timestamp, in seconds.", + "complexity": "O(N) where N is the number of specified fields", + "group": "hash", + "since": "7.4.0", + "arity": -5, + "function": "hexpiretimeCommand", + "history": [], + "command_flags": [ + "READONLY", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "Array of results. 
Returns empty array if the key does not exist.", + "type": "array", + "minItems": 0, + "maxItems": 4294967295, + "items": { + "oneOf": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "The field exists but has no associated expire.", + "const": -1 + }, + { + "description": "Expiration Unix timestamp in seconds.", + "type": "integer", + "minimum": 1 + } + ] + } + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "fields", + "token": "FIELDS", + "type": "block", + "arguments": [ + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } + ] + } +} diff --git a/src/commands/hpersist.json b/src/commands/hpersist.json new file mode 100644 index 00000000000..e7c1cb11bc9 --- /dev/null +++ b/src/commands/hpersist.json @@ -0,0 +1,83 @@ +{ + "HPERSIST": { + "summary": "Removes the expiration time for each specified field", + "complexity": "O(N) where N is the number of specified fields", + "group": "hash", + "since": "7.4.0", + "arity": -5, + "function": "hpersistCommand", + "history": [], + "command_flags": [ + "WRITE", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + "RW", + "UPDATE" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "Array of results. 
Returns empty array if the key does not exist.", + "type": "array", + "minItems": 0, + "maxItems": 4294967295, + "items": { + "oneOf": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "The field exists but has no associated expire.", + "const": -1 + }, + { + "description": "Expiration time was removed", + "const": 1 + } + ] + } + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "fields", + "token": "FIELDS", + "type": "block", + "arguments": [ + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } + ] + } +} diff --git a/src/commands/hpexpire.json b/src/commands/hpexpire.json new file mode 100644 index 00000000000..02c68e61634 --- /dev/null +++ b/src/commands/hpexpire.json @@ -0,0 +1,119 @@ +{ + "HPEXPIRE": { + "summary": "Set expiry for hash field using relative time to expire (milliseconds)", + "complexity": "O(N) where N is the number of specified fields", + "group": "hash", + "since": "7.4.0", + "arity": -6, + "function": "hpexpireCommand", + "history": [], + "command_flags": [ + "WRITE", + "DENYOOM", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + "RW", + "UPDATE" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "Array of results. 
Returns empty array if the key does not exist.", + "type": "array", + "minItems": 0, + "maxItems": 4294967295, + "items": { + "oneOf": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "Specified NX | XX | GT | LT condition not met", + "const": 0 + }, + { + "description": "Expiration time was set or updated.", + "const": 1 + }, + { + "description": "Field deleted because the specified expiration time is in the past.", + "const": 2 + } + ] + } + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "milliseconds", + "type": "integer" + }, + { + "name": "condition", + "type": "oneof", + "optional": true, + "arguments": [ + { + "name": "nx", + "type": "pure-token", + "token": "NX" + }, + { + "name": "xx", + "type": "pure-token", + "token": "XX" + }, + { + "name": "gt", + "type": "pure-token", + "token": "GT" + }, + { + "name": "lt", + "type": "pure-token", + "token": "LT" + } + ] + }, + { + "name": "fields", + "token": "FIELDS", + "type": "block", + "arguments": [ + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } + ] + } +} \ No newline at end of file diff --git a/src/commands/hpexpireat.json b/src/commands/hpexpireat.json new file mode 100644 index 00000000000..58e5555fb5f --- /dev/null +++ b/src/commands/hpexpireat.json @@ -0,0 +1,119 @@ +{ + "HPEXPIREAT": { + "summary": "Set expiry for hash field using an absolute Unix timestamp (milliseconds)", + "complexity": "O(N) where N is the number of specified fields", + "group": "hash", + "since": "7.4.0", + "arity": -6, + "function": "hpexpireatCommand", + "history": [], + "command_flags": [ + "WRITE", + "DENYOOM", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + "RW", + "UPDATE" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + 
"reply_schema": { + "description": "Array of results. Returns empty array if the key does not exist.", + "type": "array", + "minItems": 0, + "maxItems": 4294967295, + "items": { + "oneOf": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "Specified NX | XX | GT | LT condition not met", + "const": 0 + }, + { + "description": "Expiration time was set or updated.", + "const": 1 + }, + { + "description": "Field deleted because the specified expiration time is in the past.", + "const": 2 + } + ] + } + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "unix-time-milliseconds", + "type": "unix-time" + }, + { + "name": "condition", + "type": "oneof", + "optional": true, + "arguments": [ + { + "name": "nx", + "type": "pure-token", + "token": "NX" + }, + { + "name": "xx", + "type": "pure-token", + "token": "XX" + }, + { + "name": "gt", + "type": "pure-token", + "token": "GT" + }, + { + "name": "lt", + "type": "pure-token", + "token": "LT" + } + ] + }, + { + "name": "fields", + "token": "FIELDS", + "type": "block", + "arguments": [ + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } + ] + } +} \ No newline at end of file diff --git a/src/commands/hpexpiretime.json b/src/commands/hpexpiretime.json new file mode 100644 index 00000000000..67406cb7dad --- /dev/null +++ b/src/commands/hpexpiretime.json @@ -0,0 +1,84 @@ +{ + "HPEXPIRETIME": { + "summary": "Returns the expiration time of a hash field as a Unix timestamp, in msec.", + "complexity": "O(N) where N is the number of specified fields", + "group": "hash", + "since": "7.4.0", + "arity": -5, + "function": "hpexpiretimeCommand", + "history": [], + "command_flags": [ + "READONLY", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + 
"range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "Array of results. Returns empty array if the key does not exist.", + "type": "array", + "minItems": 0, + "maxItems": 4294967295, + "items": { + "oneOf": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "The field exists but has no associated expire.", + "const": -1 + }, + { + "description": "Expiration Unix timestamp in milliseconds.", + "type": "integer", + "minimum": 1 + } + ] + } + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "fields", + "token": "FIELDS", + "type": "block", + "arguments": [ + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } + ] + } +} diff --git a/src/commands/hpttl.json b/src/commands/hpttl.json new file mode 100644 index 00000000000..9f24bec8f3e --- /dev/null +++ b/src/commands/hpttl.json @@ -0,0 +1,84 @@ +{ + "HPTTL": { + "summary": "Returns the TTL in milliseconds of a hash field.", + "complexity": "O(N) where N is the number of specified fields", + "group": "hash", + "since": "7.4.0", + "arity": -5, + "function": "hpttlCommand", + "history": [], + "command_flags": [ + "READONLY", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + "RO", + "ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "Array of results. 
Returns empty array if the key does not exist.", + "type": "array", + "minItems": 0, + "maxItems": 4294967295, + "items": { + "oneOf": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "The field exists but has no associated expire.", + "const": -1 + }, + { + "description": "TTL in milliseconds.", + "type": "integer", + "minimum": 1 + } + ] + } + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "fields", + "token": "FIELDS", + "type": "block", + "arguments": [ + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } + ] + } +} diff --git a/src/commands/hscan.json b/src/commands/hscan.json index 0888eec9fce..99e916574f6 100644 --- a/src/commands/hscan.json +++ b/src/commands/hscan.json @@ -56,6 +56,12 @@ "name": "count", "type": "integer", "optional": true + }, + { + "token": "NOVALUES", + "name": "novalues", + "type": "pure-token", + "optional": true } ], "reply_schema": { @@ -69,7 +75,7 @@ "type": "string" }, { - "description": "list of key/value pairs from the hash where each even element is the key, and each odd element is the value", + "description": "list of key/value pairs from the hash where each even element is the key, and each odd element is the value, or when novalues option is on, a list of keys from the hash", "type": "array", "items": { "type": "string" diff --git a/src/commands/httl.json b/src/commands/httl.json new file mode 100644 index 00000000000..e0e865056af --- /dev/null +++ b/src/commands/httl.json @@ -0,0 +1,84 @@ +{ + "HTTL": { + "summary": "Returns the TTL in seconds of a hash field.", + "complexity": "O(N) where N is the number of specified fields", + "group": "hash", + "since": "7.4.0", + "arity": -5, + "function": "httlCommand", + "history": [], + "command_flags": [ + "READONLY", + "FAST" + ], + "acl_categories": [ + "HASH" + ], + "key_specs": [ + { + "flags": [ + "RO", + 
"ACCESS" + ], + "begin_search": { + "index": { + "pos": 1 + } + }, + "find_keys": { + "range": { + "lastkey": 0, + "step": 1, + "limit": 0 + } + } + } + ], + "reply_schema": { + "description": "Array of results. Returns empty array if the key does not exist.", + "type": "array", + "minItems": 0, + "maxItems": 4294967295, + "items": { + "oneOf": [ + { + "description": "The field does not exist.", + "const": -2 + }, + { + "description": "The field exists but has no associated expire.", + "const": -1 + }, + { + "description": "TTL in seconds.", + "type": "integer", + "minimum": 1 + } + ] + } + }, + "arguments": [ + { + "name": "key", + "type": "key", + "key_spec_index": 0 + }, + { + "name": "fields", + "token": "FIELDS", + "type": "block", + "arguments": [ + { + "name": "numfields", + "type": "integer" + }, + { + "name": "field", + "type": "string", + "multiple": true + } + ] + } + ] + } +} diff --git a/src/commands/latency-reset.json b/src/commands/latency-reset.json index d4891da5edc..322328277c7 100644 --- a/src/commands/latency-reset.json +++ b/src/commands/latency-reset.json @@ -15,7 +15,7 @@ ], "command_tips": [ "REQUEST_POLICY:ALL_NODES", - "RESPONSE_POLICY:ALL_SUCCEEDED" + "RESPONSE_POLICY:AGG_SUM" ], "reply_schema": { "type": "integer", diff --git a/src/commands/memory-stats.json b/src/commands/memory-stats.json index de82dc8cc59..98e49b7d271 100644 --- a/src/commands/memory-stats.json +++ b/src/commands/memory-stats.json @@ -47,9 +47,18 @@ "functions.caches": { "type": "integer" }, + "overhead.db.hashtable.lut": { + "type": "integer" + }, + "overhead.db.hashtable.rehashing": { + "type": "integer" + }, "overhead.total": { "type": "integer" }, + "db.dict.rehashing.count": { + "type": "integer" + }, "keys.count": { "type": "integer" }, @@ -74,6 +83,9 @@ "allocator.resident": { "type": "integer" }, + "allocator.muzzy": { + "type": "integer" + }, "allocator-fragmentation.ratio": { "type": "number" }, @@ -100,7 +112,7 @@ } }, "patternProperties": { - "^db.": { + 
"^db\\.\\d+$": { "type": "object", "properties": { "overhead.hashtable.main": { @@ -108,9 +120,6 @@ }, "overhead.hashtable.expires": { "type": "integer" - }, - "overhead.hashtable.slot-to-keys": { - "type": "integer" } }, "additionalProperties": false diff --git a/src/commands/msetnx.json b/src/commands/msetnx.json index fa71d2b45bc..27592d3044a 100644 --- a/src/commands/msetnx.json +++ b/src/commands/msetnx.json @@ -13,10 +13,6 @@ "acl_categories": [ "STRING" ], - "command_tips": [ - "REQUEST_POLICY:MULTI_SHARD", - "RESPONSE_POLICY:AGG_MIN" - ], "key_specs": [ { "flags": [ diff --git a/src/commands/randomkey.json b/src/commands/randomkey.json index e8773ee6b01..eeef61aef17 100644 --- a/src/commands/randomkey.json +++ b/src/commands/randomkey.json @@ -15,6 +15,7 @@ ], "command_tips": [ "REQUEST_POLICY:ALL_SHARDS", + "RESPONSE_POLICY:SPECIAL", "NONDETERMINISTIC_OUTPUT" ], "reply_schema": { diff --git a/src/commands/replicaof.json b/src/commands/replicaof.json index aa49390197c..95e5cb400b8 100644 --- a/src/commands/replicaof.json +++ b/src/commands/replicaof.json @@ -14,12 +14,40 @@ ], "arguments": [ { - "name": "host", - "type": "string" - }, - { - "name": "port", - "type": "integer" + "name": "args", + "type": "oneof", + "arguments": [ + { + "name": "host-port", + "type": "block", + "arguments": [ + { + "name": "host", + "type": "string" + }, + { + "name": "port", + "type": "integer" + } + ] + }, + { + "name": "no-one", + "type": "block", + "arguments": [ + { + "name": "no", + "type": "pure-token", + "token": "NO" + }, + { + "name": "one", + "type": "pure-token", + "token": "ONE" + } + ] + } + ] } ], "reply_schema": { diff --git a/src/commands/scan.json b/src/commands/scan.json index ca9adf5b44b..a7df78a218b 100644 --- a/src/commands/scan.json +++ b/src/commands/scan.json @@ -21,7 +21,8 @@ ], "command_tips": [ "NONDETERMINISTIC_OUTPUT", - "REQUEST_POLICY:SPECIAL" + "REQUEST_POLICY:SPECIAL", + "RESPONSE_POLICY:SPECIAL" ], "arguments": [ { diff --git 
a/src/commands/sinterstore.json b/src/commands/sinterstore.json index 28ccfff691e..e8e4bb44746 100644 --- a/src/commands/sinterstore.json +++ b/src/commands/sinterstore.json @@ -16,7 +16,7 @@ "key_specs": [ { "flags": [ - "RW", + "OW", "UPDATE" ], "begin_search": { diff --git a/src/commands/slaveof.json b/src/commands/slaveof.json index 9790730b25e..6595960f949 100644 --- a/src/commands/slaveof.json +++ b/src/commands/slaveof.json @@ -19,12 +19,40 @@ ], "arguments": [ { - "name": "host", - "type": "string" - }, - { - "name": "port", - "type": "integer" + "name": "args", + "type": "oneof", + "arguments": [ + { + "name": "host-port", + "type": "block", + "arguments": [ + { + "name": "host", + "type": "string" + }, + { + "name": "port", + "type": "integer" + } + ] + }, + { + "name": "no-one", + "type": "block", + "arguments": [ + { + "name": "no", + "type": "pure-token", + "token": "NO" + }, + { + "name": "one", + "type": "pure-token", + "token": "ONE" + } + ] + } + ] } ], "reply_schema": { diff --git a/src/commands/sort.json b/src/commands/sort.json index 5e117c9501a..d5f6511142d 100644 --- a/src/commands/sort.json +++ b/src/commands/sort.json @@ -150,7 +150,7 @@ "type": "string" }, { - "description": "GET option is specified, but no object was found ", + "description": "GET option is specified, but no object was found", "type": "null" } ] diff --git a/src/commands/sort_ro.json b/src/commands/sort_ro.json index 8b32b17fa1f..04cc3c8417a 100644 --- a/src/commands/sort_ro.json +++ b/src/commands/sort_ro.json @@ -117,7 +117,15 @@ "description": "a list of sorted elements", "type": "array", "items": { - "type": "string" + "oneOf": [ + { + "type": "string" + }, + { + "description": "GET option is specified, but no object was found", + "type": "null" + } + ] } } } diff --git a/src/commands/wait.json b/src/commands/wait.json index f936b924218..cb82f64956c 100644 --- a/src/commands/wait.json +++ b/src/commands/wait.json @@ -7,6 +7,7 @@ "arity": 3, "function": "waitCommand", 
"command_flags": [ + "BLOCKING" ], "acl_categories": [ "CONNECTION" diff --git a/src/commands/waitaof.json b/src/commands/waitaof.json index 735a8f261f0..19b514c2741 100644 --- a/src/commands/waitaof.json +++ b/src/commands/waitaof.json @@ -7,7 +7,7 @@ "arity": 4, "function": "waitaofCommand", "command_flags": [ - "NOSCRIPT" + "BLOCKING" ], "acl_categories": [ "CONNECTION" diff --git a/src/commands/xgroup-create.json b/src/commands/xgroup-create.json index 6b11a1f002c..119d7f300bd 100644 --- a/src/commands/xgroup-create.json +++ b/src/commands/xgroup-create.json @@ -72,8 +72,9 @@ "optional": true }, { + "name": "entriesread", + "display": "entries-read", "token": "ENTRIESREAD", - "name": "entries-read", "type": "integer", "optional": true } diff --git a/src/commands/xinfo-consumers.json b/src/commands/xinfo-consumers.json index b507e8e5926..8713a60b0f1 100644 --- a/src/commands/xinfo-consumers.json +++ b/src/commands/xinfo-consumers.json @@ -10,7 +10,7 @@ "history": [ [ "7.2.0", - "Added the `inactive` field." + "Added the `inactive` field, and changed the meaning of `idle`." 
] ], "command_flags": [ diff --git a/src/commands/xinfo-stream.json b/src/commands/xinfo-stream.json index 018826f91a6..609dc336d56 100644 --- a/src/commands/xinfo-stream.json +++ b/src/commands/xinfo-stream.json @@ -292,7 +292,8 @@ }, "seen-time": { "description": "timestamp of the last interaction attempt of the consumer", - "type": "integer" + "type": "integer", + "minimum": 0 }, "pel-count": { "description": "number of unacknowledged entries that belong to the consumer", diff --git a/src/commands/xread.json b/src/commands/xread.json index 3a78ffb224e..95e22c494e1 100644 --- a/src/commands/xread.json +++ b/src/commands/xread.json @@ -8,8 +8,7 @@ "get_keys_function": "xreadGetKeys", "command_flags": [ "BLOCKING", - "READONLY", - "BLOCKING" + "READONLY" ], "acl_categories": [ "STREAM" diff --git a/src/config.c b/src/config.c index b26704283f0..e1868e0b2ea 100644 --- a/src/config.c +++ b/src/config.c @@ -1,31 +1,10 @@ /* Configuration file parsing and CONFIG GET/SET commands implementation. * - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -1120,12 +1099,22 @@ struct rewriteConfigState *rewriteConfigReadOldFile(char *path) { if (fp == NULL && errno != ENOENT) return NULL; struct redis_stat sb; - if (fp && redis_fstat(fileno(fp),&sb) == -1) return NULL; + if (fp && redis_fstat(fileno(fp),&sb) == -1) { + fclose(fp); + return NULL; + } int linenum = -1; struct rewriteConfigState *state = rewriteConfigCreateState(); - if (fp == NULL || sb.st_size == 0) return state; + if (fp == NULL) { + return state; + } + + if (sb.st_size == 0) { + fclose(fp); + return state; + } /* Load the file content */ sds config = sdsnewlen(SDS_NOINIT,sb.st_size); @@ -2387,7 +2376,7 @@ static int isValidShutdownOnSigFlags(int val, const char **err) { static int isValidAnnouncedNodename(char *val,const char **err) { if (!(isValidAuxString(val,sdslen(val)))) { *err = "Announced human node name contained invalid character"; - return 0; + return 0; } return 1; } @@ -2468,6 +2457,12 @@ static int updatePort(const char **err) { return 1; } +static int updateDefragConfiguration(const char 
**err) { + UNUSED(err); + server.active_defrag_configuration_changed = 1; + return 1; +} + static int updateJemallocBgThread(const char **err) { UNUSED(err); set_jemalloc_bg_thread(server.jemalloc_bg_thread); @@ -2528,9 +2523,9 @@ static int updateAofAutoGCEnabled(const char **err) { static int updateSighandlerEnabled(const char **err) { UNUSED(err); if (server.crashlog_enabled) - setupSignalHandlers(); + setupSigSegvHandler(); else - removeSignalHandlers(); + removeSigSegvHandlers(); return 1; } @@ -3110,10 +3105,10 @@ standardConfig static_configs[] = { createStringConfig("dbfilename", NULL, MODIFIABLE_CONFIG | PROTECTED_CONFIG, ALLOW_EMPTY_STRING, server.rdb_filename, "dump.rdb", isValidDBfilename, NULL), createStringConfig("appendfilename", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, server.aof_filename, "appendonly.aof", isValidAOFfilename, NULL), createStringConfig("appenddirname", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, server.aof_dirname, "appendonlydir", isValidAOFdirname, NULL), - createStringConfig("server_cpulist", NULL, IMMUTABLE_CONFIG, EMPTY_STRING_IS_NULL, server.server_cpulist, NULL, NULL, NULL), - createStringConfig("bio_cpulist", NULL, IMMUTABLE_CONFIG, EMPTY_STRING_IS_NULL, server.bio_cpulist, NULL, NULL, NULL), - createStringConfig("aof_rewrite_cpulist", NULL, IMMUTABLE_CONFIG, EMPTY_STRING_IS_NULL, server.aof_rewrite_cpulist, NULL, NULL, NULL), - createStringConfig("bgsave_cpulist", NULL, IMMUTABLE_CONFIG, EMPTY_STRING_IS_NULL, server.bgsave_cpulist, NULL, NULL, NULL), + createStringConfig("server-cpulist", "server_cpulist", IMMUTABLE_CONFIG, EMPTY_STRING_IS_NULL, server.server_cpulist, NULL, NULL, NULL), + createStringConfig("bio-cpulist", "bio_cpulist", IMMUTABLE_CONFIG, EMPTY_STRING_IS_NULL, server.bio_cpulist, NULL, NULL, NULL), + createStringConfig("aof-rewrite-cpulist", "aof_rewrite_cpulist", IMMUTABLE_CONFIG, EMPTY_STRING_IS_NULL, server.aof_rewrite_cpulist, NULL, NULL, NULL), + createStringConfig("bgsave-cpulist", "bgsave_cpulist", 
IMMUTABLE_CONFIG, EMPTY_STRING_IS_NULL, server.bgsave_cpulist, NULL, NULL, NULL), createStringConfig("ignore-warnings", NULL, MODIFIABLE_CONFIG, ALLOW_EMPTY_STRING, server.ignore_warnings, "", NULL, NULL), createStringConfig("proc-title-template", NULL, MODIFIABLE_CONFIG, ALLOW_EMPTY_STRING, server.proc_title_template, CONFIG_DEFAULT_PROC_TITLE_TEMPLATE, isValidProcTitleTemplate, updateProcTitleTemplate), createStringConfig("bind-source-addr", NULL, MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.bind_source_addr, NULL, NULL, NULL), @@ -3154,15 +3149,15 @@ standardConfig static_configs[] = { createIntConfig("list-max-listpack-size", "list-max-ziplist-size", MODIFIABLE_CONFIG, INT_MIN, INT_MAX, server.list_max_listpack_size, -2, INTEGER_CONFIG, NULL, NULL), createIntConfig("tcp-keepalive", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.tcpkeepalive, 300, INTEGER_CONFIG, NULL, NULL), createIntConfig("cluster-migration-barrier", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.cluster_migration_barrier, 1, INTEGER_CONFIG, NULL, NULL), - createIntConfig("active-defrag-cycle-min", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cycle_min, 1, INTEGER_CONFIG, NULL, NULL), /* Default: 1% CPU min (at lower threshold) */ - createIntConfig("active-defrag-cycle-max", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cycle_max, 25, INTEGER_CONFIG, NULL, NULL), /* Default: 25% CPU max (at upper threshold) */ + createIntConfig("active-defrag-cycle-min", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cycle_min, 1, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 1% CPU min (at lower threshold) */ + createIntConfig("active-defrag-cycle-max", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cycle_max, 25, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 25% CPU max (at upper threshold) */ createIntConfig("active-defrag-threshold-lower", NULL, MODIFIABLE_CONFIG, 0, 1000, server.active_defrag_threshold_lower, 10, INTEGER_CONFIG, NULL, NULL), /* 
Default: don't defrag when fragmentation is below 10% */ - createIntConfig("active-defrag-threshold-upper", NULL, MODIFIABLE_CONFIG, 0, 1000, server.active_defrag_threshold_upper, 100, INTEGER_CONFIG, NULL, NULL), /* Default: maximum defrag force at 100% fragmentation */ + createIntConfig("active-defrag-threshold-upper", NULL, MODIFIABLE_CONFIG, 0, 1000, server.active_defrag_threshold_upper, 100, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: maximum defrag force at 100% fragmentation */ createIntConfig("lfu-log-factor", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.lfu_log_factor, 10, INTEGER_CONFIG, NULL, NULL), createIntConfig("lfu-decay-time", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.lfu_decay_time, 1, INTEGER_CONFIG, NULL, NULL), createIntConfig("replica-priority", "slave-priority", MODIFIABLE_CONFIG, 0, INT_MAX, server.slave_priority, 100, INTEGER_CONFIG, NULL, NULL), createIntConfig("repl-diskless-sync-delay", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.repl_diskless_sync_delay, 5, INTEGER_CONFIG, NULL, NULL), - createIntConfig("maxmemory-samples", NULL, MODIFIABLE_CONFIG, 1, INT_MAX, server.maxmemory_samples, 5, INTEGER_CONFIG, NULL, NULL), + createIntConfig("maxmemory-samples", NULL, MODIFIABLE_CONFIG, 1, 64, server.maxmemory_samples, 5, INTEGER_CONFIG, NULL, NULL), createIntConfig("maxmemory-eviction-tenacity", NULL, MODIFIABLE_CONFIG, 0, 100, server.maxmemory_eviction_tenacity, 10, INTEGER_CONFIG, NULL, NULL), createIntConfig("timeout", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.maxidletime, 0, INTEGER_CONFIG, NULL, NULL), /* Default client timeout: infinite */ createIntConfig("replica-announce-port", "slave-announce-port", MODIFIABLE_CONFIG, 0, 65535, server.slave_announce_port, 0, INTEGER_CONFIG, NULL, NULL), @@ -3188,6 +3183,8 @@ standardConfig static_configs[] = { createUIntConfig("maxclients", NULL, MODIFIABLE_CONFIG, 1, UINT_MAX, server.maxclients, 10000, INTEGER_CONFIG, NULL, updateMaxclients), createUIntConfig("unixsocketperm", 
NULL, IMMUTABLE_CONFIG, 0, 0777, server.unixsocketperm, 0, OCTAL_CONFIG, NULL, NULL), createUIntConfig("socket-mark-id", NULL, IMMUTABLE_CONFIG, 0, UINT_MAX, server.socket_mark_id, 0, INTEGER_CONFIG, NULL, NULL), + createUIntConfig("max-new-connections-per-cycle", NULL, MODIFIABLE_CONFIG, 1, 1000, server.max_new_conns_per_cycle, 10, INTEGER_CONFIG, NULL, NULL), + createUIntConfig("max-new-tls-connections-per-cycle", NULL, MODIFIABLE_CONFIG, 1, 1000, server.max_new_tls_conns_per_cycle, 1, INTEGER_CONFIG, NULL, NULL), #ifdef LOG_REQ_RES createUIntConfig("client-default-resp", NULL, IMMUTABLE_CONFIG | HIDDEN_CONFIG, 2, 3, server.client_default_resp, 2, INTEGER_CONFIG, NULL, NULL), #endif @@ -3241,10 +3238,10 @@ standardConfig static_configs[] = { createBoolConfig("tls-session-caching", NULL, MODIFIABLE_CONFIG, server.tls_ctx_config.session_caching, 1, NULL, applyTlsCfg), createStringConfig("tls-cert-file", NULL, VOLATILE_CONFIG | MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.cert_file, NULL, NULL, applyTlsCfg), createStringConfig("tls-key-file", NULL, VOLATILE_CONFIG | MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.key_file, NULL, NULL, applyTlsCfg), - createStringConfig("tls-key-file-pass", NULL, MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.key_file_pass, NULL, NULL, applyTlsCfg), + createStringConfig("tls-key-file-pass", NULL, MODIFIABLE_CONFIG | SENSITIVE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.key_file_pass, NULL, NULL, applyTlsCfg), createStringConfig("tls-client-cert-file", NULL, VOLATILE_CONFIG | MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.client_cert_file, NULL, NULL, applyTlsCfg), createStringConfig("tls-client-key-file", NULL, VOLATILE_CONFIG | MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.client_key_file, NULL, NULL, applyTlsCfg), - createStringConfig("tls-client-key-file-pass", NULL, MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, 
server.tls_ctx_config.client_key_file_pass, NULL, NULL, applyTlsCfg), + createStringConfig("tls-client-key-file-pass", NULL, MODIFIABLE_CONFIG | SENSITIVE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.client_key_file_pass, NULL, NULL, applyTlsCfg), createStringConfig("tls-dh-params-file", NULL, VOLATILE_CONFIG | MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.dh_params_file, NULL, NULL, applyTlsCfg), createStringConfig("tls-ca-cert-file", NULL, VOLATILE_CONFIG | MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.ca_cert_file, NULL, NULL, applyTlsCfg), createStringConfig("tls-ca-cert-dir", NULL, VOLATILE_CONFIG | MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.tls_ctx_config.ca_cert_dir, NULL, NULL, applyTlsCfg), diff --git a/src/config.h b/src/config.h index 3c9a2701388..61393bd531c 100644 --- a/src/config.h +++ b/src/config.h @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef __CONFIG_H @@ -40,8 +19,12 @@ #include #endif +#if defined(__APPLE__) && defined(__MAC_OS_X_VERSION_MAX_ALLOWED) && __MAC_OS_X_VERSION_MAX_ALLOWED >= 1060 +#define MAC_OS_10_6_DETECTED +#endif + /* Define redis_fstat to fstat or fstat64() */ -#if defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6) +#if defined(__APPLE__) && !defined(MAC_OS_10_6_DETECTED) #define redis_fstat fstat64 #define redis_stat stat64 #else @@ -92,11 +75,13 @@ #endif /* Test for accept4() */ -#ifdef __linux__ +#if defined(__linux__) || defined(OpenBSD5_7) || \ + (__FreeBSD__ >= 10 || __FreeBSD_version >= 1000000) || \ + (defined(NetBSD8_0) || __NetBSD_Version__ >= 800000000) #define HAVE_ACCEPT4 1 #endif -#if (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined (__NetBSD__) +#if (defined(__APPLE__) && defined(MAC_OS_10_6_DETECTED)) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined (__NetBSD__) #define HAVE_KQUEUE 1 #endif @@ -293,7 +278,7 @@ void setproctitle(const char *fmt, ...); #include #define redis_set_thread_title(name) rename_thread(find_thread(0), name) #else -#if (defined __APPLE__ && defined(MAC_OS_X_VERSION_10_7)) +#if (defined __APPLE__ && defined(__MAC_OS_X_VERSION_MAX_ALLOWED) && __MAC_OS_X_VERSION_MAX_ALLOWED >= 1070) int 
pthread_setname_np(const char *name); #include #define redis_set_thread_title(name) pthread_setname_np(name) diff --git a/src/connection.h b/src/connection.h index d0a17ab4dd6..a8c296d156a 100644 --- a/src/connection.h +++ b/src/connection.h @@ -1,31 +1,10 @@ /* - * Copyright (c) 2019, Redis Labs + * Copyright (c) 2019-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #ifndef __REDIS_CONNECTION_H @@ -40,7 +19,6 @@ #define CONN_INFO_LEN 32 #define CONN_ADDR_STR_LEN 128 /* Similar to INET6_ADDRSTRLEN, hoping to handle other protocols. */ -#define MAX_ACCEPTS_PER_CALL 1000 struct aeEventLoop; typedef struct connection connection; diff --git a/src/connhelpers.h b/src/connhelpers.h index b32e44dba06..79737fa8e9a 100644 --- a/src/connhelpers.h +++ b/src/connhelpers.h @@ -1,31 +1,10 @@ /* - * Copyright (c) 2019, Redis Labs + * Copyright (c) 2019-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef __REDIS_CONNHELPERS_H diff --git a/src/crc16.c b/src/crc16.c index 7b8c1dad0a1..d9e4f3f4997 100644 --- a/src/crc16.c +++ b/src/crc16.c @@ -2,7 +2,7 @@ /* * Copyright 2001-2010 Georges Menie (www.menie.org) - * Copyright 2010-2012 Salvatore Sanfilippo (adapted to Redis coding style) + * Copyright 2010-current Redis Ltd. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/crc16_slottable.h b/src/crc16_slottable.h index 652aea9e1fb..f25e2412e89 100644 --- a/src/crc16_slottable.h +++ b/src/crc16_slottable.h @@ -7,8 +7,9 @@ * The array indexes are slot numbers, so that given a desired slot, this string is guaranteed * to make redis cluster route a request to the shard holding this slot */ +typedef char crc16_alphastring[4]; -const char *crc16_slot_table[] = { +const crc16_alphastring crc16_slot_table[] = { "06S", "Qi", "5L5", "4Iu", "4gY", "460", "1Y7", "1LV", "0QG", "ru", "7Ok", "4ji", "4DE", "65n", "2JH", "I8", "F9", "SX", "7nF", "4KD", "4eh", "6PK", "2ke", "1Ng", "0Sv", "4L", "491", "4hX", "4Ft", "5C4", "2Hy", "09R", "021", "0cX", "4Xv", "6mU", "6Cy", "42R", "0Mt", "nF", "cv", "1Pe", "5kK", "6NI", "74L", "4UF", "0nh", "MZ", "2TJ", "0ai", "4ZG", "6od", "6AH", "40c", "0OE", "lw", "aG", "0Bu", "5iz", "6Lx", diff --git a/src/db.c b/src/db.c index 4c8c0d287f4..4c5bbd88108 100644 --- a/src/db.c +++ b/src/db.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -36,6 +15,7 @@ #include #include +#include "bio.h" /*----------------------------------------------------------------------------- * C-level DB API @@ -45,7 +25,14 @@ #define EXPIRE_FORCE_DELETE_EXPIRED 1 #define EXPIRE_AVOID_DELETE_EXPIRED 2 -int expireIfNeeded(redisDb *db, robj *key, int flags); +/* Return values for expireIfNeeded */ +typedef enum { + KEY_VALID = 0, /* Could be volatile and not yet expired, non-volatile, or even non-existing key. */ + KEY_EXPIRED, /* Logically expired but not yet deleted. */ + KEY_DELETED /* The key was deleted now. */ +} keyStatus; + +keyStatus expireIfNeeded(redisDb *db, robj *key, int flags); int keyIsExpired(redisDb *db, robj *key); static void dbSetValue(redisDb *db, robj *key, robj *val, int overwrite, dictEntry *de); @@ -86,7 +73,7 @@ void updateLFU(robj *val) { * expired on replicas even if the master is lagging expiring our key via DELs * in the replication link. 
*/ robj *lookupKey(redisDb *db, robj *key, int flags) { - dictEntry *de = dictFind(db->dict,key->ptr); + dictEntry *de = dbFind(db, key->ptr); robj *val = NULL; if (de) { val = dictGetVal(de); @@ -104,7 +91,7 @@ robj *lookupKey(redisDb *db, robj *key, int flags) { expire_flags |= EXPIRE_FORCE_DELETE_EXPIRED; if (flags & LOOKUP_NOEXPIRE) expire_flags |= EXPIRE_AVOID_DELETE_EXPIRED; - if (expireIfNeeded(db, key, expire_flags)) { + if (expireIfNeeded(db, key, expire_flags) != KEY_VALID) { /* The key is no longer valid. */ val = NULL; } @@ -188,26 +175,50 @@ robj *lookupKeyWriteOrReply(client *c, robj *key, robj *reply) { /* Add the key to the DB. It's up to the caller to increment the reference * counter of the value if needed. * - * If the update_if_existing argument is false, the the program is aborted - * if the key already exists, otherwise, it can fall back to dbOverwite. */ -static void dbAddInternal(redisDb *db, robj *key, robj *val, int update_if_existing) { + * If the update_if_existing argument is false, the program is aborted + * if the key already exists, otherwise, it can fall back to dbOverwrite. 
*/ +static dictEntry *dbAddInternal(redisDb *db, robj *key, robj *val, int update_if_existing) { dictEntry *existing; - dictEntry *de = dictAddRaw(db->dict, key->ptr, &existing); + int slot = getKeySlot(key->ptr); + dictEntry *de = kvstoreDictAddRaw(db->keys, slot, key->ptr, &existing); if (update_if_existing && existing) { dbSetValue(db, key, val, 1, existing); - return; + return existing; } serverAssertWithInfo(NULL, key, de != NULL); - dictSetKey(db->dict, de, sdsdup(key->ptr)); + kvstoreDictSetKey(db->keys, slot, de, sdsdup(key->ptr)); initObjectLRUOrLFU(val); - dictSetVal(db->dict, de, val); + kvstoreDictSetVal(db->keys, slot, de, val); signalKeyAsReady(db, key, val->type); - if (server.cluster_enabled) slotToKeyAddEntry(de, db); notifyKeyspaceEvent(NOTIFY_NEW,"new",key,db->id); + return de; } -void dbAdd(redisDb *db, robj *key, robj *val) { - dbAddInternal(db, key, val, 0); +dictEntry *dbAdd(redisDb *db, robj *key, robj *val) { + return dbAddInternal(db, key, val, 0); +} + +/* Returns key's hash slot when cluster mode is enabled, or 0 when disabled. + * The only difference between this function and getKeySlot, is that it's not using cached key slot from the current_client + * and always calculates CRC hash. + * This is useful when slot needs to be calculated for a key that user didn't request for, such as in case of eviction. */ +int calculateKeySlot(sds key) { + return server.cluster_enabled ? keyHashSlot(key, (int) sdslen(key)) : 0; +} + +/* Return slot-specific dictionary for key based on key's hash slot when cluster mode is enabled, else 0.*/ +int getKeySlot(sds key) { + /* This is performance optimization that uses pre-set slot id from the current command, + * in order to avoid calculation of the key hash. + * This optimization is only used when current_client flag `CLIENT_EXECUTING_COMMAND` is set. + * It only gets set during the execution of command under `call` method. Other flows requesting + * the key slot would fallback to calculateKeySlot. 
+ */ + if (server.current_client && server.current_client->slot >= 0 && server.current_client->flags & CLIENT_EXECUTING_COMMAND) { + debugServerAssertWithInfo(server.current_client, NULL, calculateKeySlot(key)==server.current_client->slot); + return server.current_client->slot; + } + return calculateKeySlot(key); } /* This is a special version of dbAdd() that is used only when loading @@ -222,11 +233,11 @@ void dbAdd(redisDb *db, robj *key, robj *val) { * ownership of the SDS string, otherwise 0 is returned, and is up to the * caller to free the SDS string. */ int dbAddRDBLoad(redisDb *db, sds key, robj *val) { - dictEntry *de = dictAddRaw(db->dict, key, NULL); + int slot = getKeySlot(key); + dictEntry *de = kvstoreDictAddRaw(db->keys, slot, key, NULL); if (de == NULL) return 0; initObjectLRUOrLFU(val); - dictSetVal(db->dict, de, val); - if (server.cluster_enabled) slotToKeyAddEntry(de, db); + kvstoreDictSetVal(db->keys, slot, de, val); return 1; } @@ -243,7 +254,8 @@ int dbAddRDBLoad(redisDb *db, sds key, robj *val) { * * The program is aborted if the key was not already present. 
*/ static void dbSetValue(redisDb *db, robj *key, robj *val, int overwrite, dictEntry *de) { - if (!de) de = dictFind(db->dict,key->ptr); + int slot = getKeySlot(key->ptr); + if (!de) de = kvstoreDictFind(db->keys, slot, key->ptr); serverAssertWithInfo(NULL,key,de != NULL); robj *old = dictGetVal(de); @@ -263,13 +275,16 @@ static void dbSetValue(redisDb *db, robj *key, robj *val, int overwrite, dictEnt /* Because of RM_StringDMA, old may be changed, so we need get old again */ old = dictGetVal(de); } - dictSetVal(db->dict, de, val); + kvstoreDictSetVal(db->keys, slot, de, val); + + /* if hash with HFEs, take care to remove from global HFE DS */ + if (old->type == OBJ_HASH) + hashTypeRemoveFromExpires(&db->hexpires, old); if (server.lazyfree_lazy_server_del) { freeObjAsync(key,old,db->id); } else { - /* This is just decrRefCount(old); */ - db->dict->type->valDestructor(db->dict, old); + decrRefCount(old); } } @@ -321,18 +336,18 @@ void setKey(client *c, redisDb *db, robj *key, robj *val, int flags) { robj *dbRandomKey(redisDb *db) { dictEntry *de; int maxtries = 100; - int allvolatile = dictSize(db->dict) == dictSize(db->expires); + int allvolatile = kvstoreSize(db->keys) == kvstoreSize(db->expires); while(1) { sds key; robj *keyobj; - - de = dictGetFairRandomKey(db->dict); + int randomSlot = kvstoreGetFairRandomDictIndex(db->keys); + de = kvstoreDictGetFairRandomKey(db->keys, randomSlot); if (de == NULL) return NULL; key = dictGetKey(de); keyobj = createStringObject(key,sdslen(key)); - if (dictFind(db->expires,key)) { + if (dbFindExpires(db, key)) { if (allvolatile && server.masterhost && --maxtries == 0) { /* If the DB is composed only of keys with an expire set, * it could happen that all the keys are already logically @@ -344,7 +359,7 @@ robj *dbRandomKey(redisDb *db) { * return a key name that may be already expired. 
*/ return keyobj; } - if (expireIfNeeded(db,keyobj,0)) { + if (expireIfNeeded(db,keyobj,0) != KEY_VALID) { decrRefCount(keyobj); continue; /* search for another key. This expired. */ } @@ -357,9 +372,15 @@ robj *dbRandomKey(redisDb *db) { int dbGenericDelete(redisDb *db, robj *key, int async, int flags) { dictEntry **plink; int table; - dictEntry *de = dictTwoPhaseUnlinkFind(db->dict,key->ptr,&plink,&table); + int slot = getKeySlot(key->ptr); + dictEntry *de = kvstoreDictTwoPhaseUnlinkFind(db->keys, slot, key->ptr, &plink, &table); if (de) { robj *val = dictGetVal(de); + + /* If hash object with expiry on fields, remove it from HFE DS of DB */ + if (val->type == OBJ_HASH) + hashTypeRemoveFromExpires(&db->hexpires, val); + /* RM_StringDMA may call dbUnshareStringValue which may free val, so we * need to incr to retain val */ incrRefCount(val); @@ -373,14 +394,13 @@ int dbGenericDelete(redisDb *db, robj *key, int async, int flags) { if (async) { /* Because of dbUnshareStringValue, the val in de may change. */ freeObjAsync(key, dictGetVal(de), db->id); - dictSetVal(db->dict, de, NULL); + kvstoreDictSetVal(db->keys, slot, de, NULL); } - if (server.cluster_enabled) slotToKeyDelEntry(de, db); - /* Deleting an entry from the expires dict will not free the sds of - * the key, because it is shared with the main dictionary. */ - if (dictSize(db->expires) > 0) dictDelete(db->expires,key->ptr); - dictTwoPhaseUnlinkFree(db->dict,de,plink,table); + * the key, because it is shared with the main dictionary. 
*/ + kvstoreDictDelete(db->expires, slot, key->ptr); + + kvstoreDictTwoPhaseUnlinkFree(db->keys, slot, de, plink, table); return 1; } else { return 0; @@ -462,12 +482,15 @@ long long emptyDbStructure(redisDb *dbarray, int dbnum, int async, } for (int j = startdb; j <= enddb; j++) { - removed += dictSize(dbarray[j].dict); + removed += kvstoreSize(dbarray[j].keys); if (async) { emptyDbAsync(&dbarray[j]); } else { - dictEmpty(dbarray[j].dict,callback); - dictEmpty(dbarray[j].expires,callback); + /* Destroy global HFE DS before deleting the hashes since ebuckets + * DS is embedded in the stored objects. */ + ebDestroy(&dbarray[j].hexpires, &hashExpireBucketsType, NULL); + kvstoreEmpty(dbarray[j].keys, callback); + kvstoreEmpty(dbarray[j].expires, callback); } /* Because all keys of database are removed, reset average ttl. */ dbarray[j].avg_ttl = 0; @@ -516,11 +539,6 @@ long long emptyData(int dbnum, int flags, void(callback)(dict*)) { /* Empty redis database structure. */ removed = emptyDbStructure(server.db, dbnum, async, callback); - /* Flush slots to keys map if enable cluster, we can flush entire - * slots to keys map whatever dbnum because only support one DB - * in cluster mode. */ - if (server.cluster_enabled) slotToKeyFlush(server.db); - if (dbnum == -1) flushSlaveKeysWithExpireList(); if (with_functions) { @@ -539,16 +557,18 @@ long long emptyData(int dbnum, int flags, void(callback)(dict*)) { /* Initialize temporary db on replica for use during diskless replication. */ redisDb *initTempDb(void) { + int slot_count_bits = 0; + int flags = KVSTORE_ALLOCATE_DICTS_ON_DEMAND; + if (server.cluster_enabled) { + slot_count_bits = CLUSTER_SLOT_MASK_BITS; + flags |= KVSTORE_FREE_EMPTY_DICTS; + } redisDb *tempDb = zcalloc(sizeof(redisDb)*server.dbnum); for (int i=0; ibstate.lazyfreeStartTime), 0); + /* lazyfree bg job always succeed */ + addReply(c, shared.ok); + + /* mark client as unblocked */ + unblockClient(c, 1); + + /* FLUSH command is finished. 
resetClient() and update replication offset. */ + commandProcessed(c); + + /* On flush completion, update the client's memory */ + updateClientMemUsageAndBucket(c); + + /* restore current_client */ + server.current_client = old_client; +} + +void flushCommandCommon(client *c, int isFlushAll) { + int blocking_async = 0; /* FLUSHALL\FLUSHDB SYNC opt to run as blocking ASYNC */ + int flags; if (getFlushCommandFlags(c,&flags) == C_ERR) return; - /* flushdb should not flush the functions */ - server.dirty += emptyData(c->db->id,flags | EMPTYDB_NOFUNCTIONS,NULL); - /* Without the forceCommandPropagation, when DB was already empty, - * FLUSHDB will not be replicated nor put into the AOF. */ + /* in case of SYNC, check if we can optimize and run it in bg as blocking ASYNC */ + if ((!(flags & EMPTYDB_ASYNC)) && (!(c->flags & CLIENT_AVOID_BLOCKING_ASYNC_FLUSH))) { + /* Run as ASYNC */ + flags |= EMPTYDB_ASYNC; + blocking_async = 1; + } + + if (isFlushAll) + flushAllDataAndResetRDB(flags | EMPTYDB_NOFUNCTIONS); + else + server.dirty += emptyData(c->db->id,flags | EMPTYDB_NOFUNCTIONS,NULL); + + /* Without the forceCommandPropagation, when DB(s) was already empty, + * FLUSHALL\FLUSHDB will not be replicated nor put into the AOF. */ forceCommandPropagation(c, PROPAGATE_REPL | PROPAGATE_AOF); - addReply(c,shared.ok); + /* if blocking ASYNC, block client and add completion job request to BIO lazyfree + * worker's queue. To be called and reply with OK only after all preceding pending + * lazyfree jobs in queue were processed */ + if (blocking_async) { + /* measure bg job till completion as elapsed time of flush command */ + elapsedStart(&c->bstate.lazyfreeStartTime); + c->bstate.timeout = 0; + blockClient(c,BLOCKED_LAZYFREE); + bioCreateCompRq(BIO_WORKER_LAZY_FREE, flushallSyncBgDone, c->id); + } else { + addReply(c, shared.ok); + } #if defined(USE_JEMALLOC) /* jemalloc 5 doesn't release pages back to the OS when there's no traffic. 
* for large databases, flushdb blocks for long anyway, so a bit more won't - * harm and this way the flush and purge will be synchronous. */ - if (!(flags & EMPTYDB_ASYNC)) + * harm and this way the flush and purge will be synchronous. + * + * Take care purge only FLUSHDB for sync flow. FLUSHALL sync flow already + * applied at flushAllDataAndResetRDB. Async flow will apply only later on */ + if ((!isFlushAll) && (!(flags & EMPTYDB_ASYNC))) { + /* Only clear the current thread cache. + * Ignore the return call since this will fail if the tcache is disabled. */ + je_mallctl("thread.tcache.flush", NULL, NULL, NULL, 0); + jemalloc_purge(); + } #endif } -/* FLUSHALL [ASYNC] +/* FLUSHALL [SYNC|ASYNC] * * Flushes the whole server data set. */ void flushallCommand(client *c) { - int flags; - if (getFlushCommandFlags(c,&flags) == C_ERR) return; - /* flushall should not flush the functions */ - flushAllDataAndResetRDB(flags | EMPTYDB_NOFUNCTIONS); - - /* Without the forceCommandPropagation, when DBs were already empty, - * FLUSHALL will not be replicated nor put into the AOF. */ - forceCommandPropagation(c, PROPAGATE_REPL | PROPAGATE_AOF); + flushCommandCommon(c, 1); +} - addReply(c,shared.ok); +/* FLUSHDB [SYNC|ASYNC] + * + * Flushes the currently SELECTed Redis DB. */ +void flushdbCommand(client *c) { + flushCommandCommon(c, 0); } /* This command implements DEL and UNLINK. */ @@ -719,7 +797,8 @@ void delGenericCommand(client *c, int lazy) { int numdel = 0, j; for (j = 1; j < c->argc; j++) { - expireIfNeeded(c->db,c->argv[j],0); + if (expireIfNeeded(c->db,c->argv[j],0) == KEY_DELETED) + continue; int deleted = lazy ? 
dbAsyncDelete(c->db,c->argv[j]) : dbSyncDelete(c->db,c->argv[j]); if (deleted) { @@ -783,17 +862,29 @@ void randomkeyCommand(client *c) { } void keysCommand(client *c) { - dictIterator *di; dictEntry *de; sds pattern = c->argv[1]->ptr; - int plen = sdslen(pattern), allkeys; + int plen = sdslen(pattern), allkeys, pslot = -1; unsigned long numkeys = 0; void *replylen = addReplyDeferredLen(c); - - di = dictGetSafeIterator(c->db->dict); allkeys = (pattern[0] == '*' && plen == 1); + if (server.cluster_enabled && !allkeys) { + pslot = patternHashSlot(pattern, plen); + } + kvstoreDictIterator *kvs_di = NULL; + kvstoreIterator *kvs_it = NULL; + if (pslot != -1) { + if (!kvstoreDictSize(c->db->keys, pslot)) { + /* Requested slot is empty */ + setDeferredArrayLen(c,replylen,0); + return; + } + kvs_di = kvstoreGetDictSafeIterator(c->db->keys, pslot); + } else { + kvs_it = kvstoreIteratorInit(c->db->keys); + } robj keyobj; - while((de = dictNext(di)) != NULL) { + while ((de = kvs_di ? kvstoreDictIteratorNext(kvs_di) : kvstoreIteratorNext(kvs_it)) != NULL) { sds key = dictGetKey(de); if (allkeys || stringmatchlen(pattern,plen,key,sdslen(key),0)) { @@ -806,7 +897,10 @@ void keysCommand(client *c) { if (c->flags & CLIENT_CLOSE_ASAP) break; } - dictReleaseIterator(di); + if (kvs_di) + kvstoreReleaseDictIterator(kvs_di); + if (kvs_it) + kvstoreIteratorRelease(kvs_it); setDeferredArrayLen(c,replylen,numkeys); } @@ -817,6 +911,8 @@ typedef struct { long long type; /* the particular type when scan the db */ sds pattern; /* pattern string, NULL means no pattern */ long sampled; /* cumulative number of keys sampled */ + int no_values; /* set to 1 means to return keys only */ + size_t (*strlen)(char *s); /* (o->type == OBJ_HASH) ? 
hfieldlen : sdslen */ } scanData; /* Helper function to compare key type in scan commands */ @@ -841,7 +937,7 @@ void scanCallback(void *privdata, const dictEntry *de) { list *keys = data->keys; robj *o = data->o; sds val = NULL; - sds key = NULL; + void *key = NULL; /* if OBJ_HASH then key is of type `hfield`. Otherwise, `sds` */ data->sampled++; /* o and typename can not have values at the same time. */ @@ -854,46 +950,44 @@ void scanCallback(void *privdata, const dictEntry *de) { } /* Filter element if it does not match the pattern. */ - sds keysds = dictGetKey(de); + void *keyStr = dictGetKey(de); if (data->pattern) { - if (!stringmatchlen(data->pattern, sdslen(data->pattern), keysds, sdslen(keysds), 0)) { + if (!stringmatchlen(data->pattern, sdslen(data->pattern), keyStr, data->strlen(keyStr), 0)) { return; } } if (o == NULL) { - key = keysds; + key = keyStr; } else if (o->type == OBJ_SET) { - key = keysds; + key = keyStr; } else if (o->type == OBJ_HASH) { - key = keysds; + key = keyStr; val = dictGetVal(de); + + /* If field is expired, then ignore */ + if (hfieldIsExpired(key)) + return; + } else if (o->type == OBJ_ZSET) { char buf[MAX_LONG_DOUBLE_CHARS]; int len = ld2string(buf, sizeof(buf), *(double *)dictGetVal(de), LD_STR_AUTO); - key = sdsdup(keysds); + key = sdsdup(keyStr); val = sdsnewlen(buf, len); } else { serverPanic("Type not handled in SCAN callback."); } listAddNodeTail(keys, key); - if (val) listAddNodeTail(keys, val); + if (val && !data->no_values) listAddNodeTail(keys, val); } /* Try to parse a SCAN cursor stored at object 'o': * if the cursor is valid, store it as unsigned integer into *cursor and * returns C_OK. Otherwise return C_ERR and send an error to the * client. */ -int parseScanCursorOrReply(client *c, robj *o, unsigned long *cursor) { - char *eptr; - - /* Use strtoul() because we need an *unsigned* long, so - * getLongLongFromObject() does not cover the whole cursor space. 
*/ - errno = 0; - *cursor = strtoul(o->ptr, &eptr, 10); - if (isspace(((char*)o->ptr)[0]) || eptr[0] != '\0' || errno == ERANGE) - { +int parseScanCursorOrReply(client *c, robj *o, unsigned long long *cursor) { + if (!string2ull(o->ptr, cursor)) { addReplyError(c, "invalid cursor"); return C_ERR; } @@ -951,14 +1045,15 @@ char *getObjectTypeName(robj *o) { * * In the case of a Hash object the function returns both the field and value * of every element on the Hash. */ -void scanGenericCommand(client *c, robj *o, unsigned long cursor) { +void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { + int isKeysHfield = 0; int i, j; listNode *node; long count = 10; sds pat = NULL; sds typename = NULL; long long type = LLONG_MAX; - int patlen = 0, use_pattern = 0; + int patlen = 0, use_pattern = 0, no_values = 0; dict *ht; /* Object must be NULL (to iterate keys names), or the type of the object @@ -1003,6 +1098,13 @@ void scanGenericCommand(client *c, robj *o, unsigned long cursor) { return; } i+= 2; + } else if (!strcasecmp(c->argv[i]->ptr, "novalues")) { + if (!o || o->type != OBJ_HASH) { + addReplyError(c, "NOVALUES option can only be used in HSCAN"); + return; + } + no_values = 1; + i++; } else { addReplyErrorObject(c,shared.syntaxerr); return; @@ -1020,10 +1122,11 @@ void scanGenericCommand(client *c, robj *o, unsigned long cursor) { /* Handle the case of a hash table. */ ht = NULL; if (o == NULL) { - ht = c->db->dict; + ht = NULL; } else if (o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HT) { ht = o->ptr; } else if (o->type == OBJ_HASH && o->encoding == OBJ_ENCODING_HT) { + isKeysHfield = 1; ht = o->ptr; } else if (o->type == OBJ_ZSET && o->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = o->ptr; @@ -1043,7 +1146,8 @@ void scanGenericCommand(client *c, robj *o, unsigned long cursor) { listSetFreeMethod(keys, (void (*)(void*))sdsfree); } - if (ht) { + /* For main dictionary scan or data structure using hashtable. 
*/ + if (!o || ht) { /* We set the max number of iterations to ten times the specified * COUNT, so if the hash table is in a pathological state (very * sparsely populated) we avoid to block too much time at the cost @@ -1056,20 +1160,36 @@ void scanGenericCommand(client *c, robj *o, unsigned long cursor) { * it is possible to fetch more data in a type-dependent way; * 3. data.type: the specified type scan in the db, LLONG_MAX means * type matching is no needed; - * 4. data.pattern: the pattern string + * 4. data.pattern: the pattern string; * 5. data.sampled: the maxiteration limit is there in case we're * working on an empty dict, one with a lot of empty buckets, and * for the buckets are not empty, we need to limit the spampled number - * to prevent a long hang time caused by filtering too many keys*/ + * to prevent a long hang time caused by filtering too many keys; + * 6. data.no_values: to control whether values will be returned or + * only keys are returned. */ scanData data = { .keys = keys, .o = o, .type = type, .pattern = use_pattern ? pat : NULL, .sampled = 0, + .no_values = no_values, + .strlen = (isKeysHfield) ? hfieldlen : sdslen, }; + + /* A pattern may restrict all matching keys to one cluster slot. */ + int onlydidx = -1; + if (o == NULL && use_pattern && server.cluster_enabled) { + onlydidx = patternHashSlot(pat, patlen); + } do { - cursor = dictScan(ht, cursor, scanCallback, &data); + /* In cluster mode there is a separate dictionary for each slot. + * If cursor is empty, we should try exploring next non-empty slot. 
*/ + if (o == NULL) { + cursor = kvstoreScan(c->db->keys, cursor, onlydidx, scanCallback, NULL, &data); + } else { + cursor = dictScan(ht, cursor, scanCallback, &data); + } } while (cursor && maxiterations-- && data.sampled < count); } else if (o->type == OBJ_SET) { char *str; @@ -1109,9 +1229,45 @@ void scanGenericCommand(client *c, robj *o, unsigned long cursor) { /* add key object */ listAddNodeTail(keys, sdsnewlen(str, len)); /* add value object */ + if (!no_values) { + str = lpGet(p, &len, intbuf); + listAddNodeTail(keys, sdsnewlen(str, len)); + } + p = lpNext(o->ptr, p); + } + cursor = 0; + } else if (o->type == OBJ_HASH && o->encoding == OBJ_ENCODING_LISTPACK_EX) { + int64_t len; + long long expire_at; + unsigned char *lp = hashTypeListpackGetLp(o); + unsigned char *p = lpFirst(lp); + unsigned char *str, *val; + unsigned char intbuf[LP_INTBUF_SIZE]; + + while (p) { str = lpGet(p, &len, intbuf); + p = lpNext(lp, p); + val = p; /* Keep pointer to value */ + + p = lpNext(lp, p); + serverAssert(p && lpGetIntegerValue(p, &expire_at)); + + if (hashTypeIsExpired(o, expire_at) || + (use_pattern && !stringmatchlen(pat, sdslen(pat), (char *)str, len, 0))) + { + /* jump to the next key/val pair */ + p = lpNext(lp, p); + continue; + } + + /* add key object */ listAddNodeTail(keys, sdsnewlen(str, len)); - p = lpNext(o->ptr, p); + /* add value object */ + if (!no_values) { + str = lpGet(val, &len, intbuf); + listAddNodeTail(keys, sdsnewlen(str, len)); + } + p = lpNext(lp, p); } cursor = 0; } else { @@ -1137,10 +1293,14 @@ void scanGenericCommand(client *c, robj *o, unsigned long cursor) { addReplyArrayLen(c, 2); addReplyBulkLongLong(c,cursor); + unsigned long long idx = 0; addReplyArrayLen(c, listLength(keys)); while ((node = listFirst(keys)) != NULL) { - sds key = listNodeValue(node); - addReplyBulkCBuffer(c, key, sdslen(key)); + void *key = listNodeValue(node); + /* For HSCAN, list will contain keys value pairs unless no_values arg + * was given. 
We should call mstrlen for the keys only. */ + int hfieldkey = isKeysHfield && (no_values || (idx++ % 2 == 0)); + addReplyBulkCBuffer(c, key, hfieldkey ? mstrlen(key) : sdslen(key)); listDelNode(keys, node); } @@ -1149,13 +1309,13 @@ void scanGenericCommand(client *c, robj *o, unsigned long cursor) { /* The SCAN command completely relies on scanGenericCommand. */ void scanCommand(client *c) { - unsigned long cursor; + unsigned long long cursor; if (parseScanCursorOrReply(c,c->argv[1],&cursor) == C_ERR) return; scanGenericCommand(c,NULL,cursor); } void dbsizeCommand(client *c) { - addReplyLongLong(c,dictSize(c->db->dict)); + addReplyLongLong(c,kvstoreSize(c->db->keys)); } void lastsaveCommand(client *c) { @@ -1233,6 +1393,7 @@ void renameGenericCommand(client *c, int nx) { robj *o; long long expire; int samekey = 0; + uint64_t minHashExpireTime = EB_EXPIRE_TIME_INVALID; /* When source and dest key is the same, no operation is performed, * if the key exists, however we still return an error on unexisting key. */ @@ -1258,9 +1419,21 @@ void renameGenericCommand(client *c, int nx) { * with the same name. */ dbDelete(c->db,c->argv[2]); } - dbAdd(c->db,c->argv[2],o); + dictEntry *de = dbAdd(c->db, c->argv[2], o); if (expire != -1) setExpire(c,c->db,c->argv[2],expire); + + /* If hash with expiration on fields then remove it from global HFE DS and + * keep next expiration time. Otherwise, dbDelete() will remove it from the + * global HFE DS and we will lose the expiration time. 
*/ + if (o->type == OBJ_HASH) + minHashExpireTime = hashTypeRemoveFromExpires(&c->db->hexpires, o); + dbDelete(c->db,c->argv[1]); + + /* If hash with HFEs, register in db->hexpires */ + if (minHashExpireTime != EB_EXPIRE_TIME_INVALID) + hashTypeAddToExpires(c->db, dictGetKey(de), o, minHashExpireTime); + signalModifiedKey(c,c->db,c->argv[1]); signalModifiedKey(c,c->db,c->argv[2]); notifyKeyspaceEvent(NOTIFY_GENERIC,"rename_from", @@ -1284,6 +1457,7 @@ void moveCommand(client *c) { redisDb *src, *dst; int srcid, dbid; long long expire; + uint64_t hashExpireTime = EB_EXPIRE_TIME_INVALID; if (server.cluster_enabled) { addReplyError(c,"MOVE is not allowed in cluster mode"); @@ -1324,12 +1498,25 @@ void moveCommand(client *c) { addReply(c,shared.czero); return; } - dbAdd(dst,c->argv[1],o); + dictEntry *dstDictEntry = dbAdd(dst,c->argv[1],o); if (expire != -1) setExpire(c,dst,c->argv[1],expire); + + /* If hash with expiration on fields, remove it from global HFE DS and keep + * aside registered expiration time. Must be before deletion of the object. + * hexpires (ebuckets) embed in stored items its structure. */ + if (o->type == OBJ_HASH) + hashExpireTime = hashTypeRemoveFromExpires(&src->hexpires, o); + incrRefCount(o); /* OK! key moved, free the entry in the source DB */ dbDelete(src,c->argv[1]); + + /* If object of type hash with expiration on fields. Taken care to add the + * hash to hexpires of `dst` only after dbDelete(). */ + if (hashExpireTime != EB_EXPIRE_TIME_INVALID) + hashTypeAddToExpires(dst, dictGetKey(dstDictEntry), o, hashExpireTime); + signalModifiedKey(c,src,c->argv[1]); signalModifiedKey(c,dst,c->argv[1]); notifyKeyspaceEvent(NOTIFY_GENERIC, @@ -1412,12 +1599,13 @@ void copyCommand(client *c) { /* Duplicate object according to object's type. 
*/ robj *newobj; + uint64_t minHashExpire = EB_EXPIRE_TIME_INVALID; /* HFE feature */ switch(o->type) { case OBJ_STRING: newobj = dupStringObject(o); break; case OBJ_LIST: newobj = listTypeDup(o); break; case OBJ_SET: newobj = setTypeDup(o); break; case OBJ_ZSET: newobj = zsetDup(o); break; - case OBJ_HASH: newobj = hashTypeDup(o); break; + case OBJ_HASH: newobj = hashTypeDup(o, newkey->ptr, &minHashExpire); break; case OBJ_STREAM: newobj = streamDup(o); break; case OBJ_MODULE: newobj = moduleTypeDupOrReply(c, key, newkey, dst->id, o); @@ -1432,8 +1620,16 @@ void copyCommand(client *c) { dbDelete(dst,newkey); } - dbAdd(dst,newkey,newobj); - if (expire != -1) setExpire(c, dst, newkey, expire); + dictEntry *deCopy = dbAdd(dst,newkey,newobj); + + /* if key with expiration then set it */ + if (expire != -1) + setExpire(c, dst, newkey, expire); + + /* If minExpiredField was set, then the object is hash with expiration + * on fields and need to register it in global HFE DS */ + if (minHashExpire != EB_EXPIRE_TIME_INVALID) + hashTypeAddToExpires(dst, dictGetKey(deCopy), newobj, minHashExpire); /* OK! 
key copied */ signalModifiedKey(c,dst,c->argv[2]); @@ -1452,7 +1648,7 @@ void scanDatabaseForReadyKeys(redisDb *db) { dictIterator *di = dictGetSafeIterator(db->blocking_keys); while((de = dictNext(di)) != NULL) { robj *key = dictGetKey(de); - dictEntry *kde = dictFind(db->dict,key->ptr); + dictEntry *kde = dbFind(db, key->ptr); if (kde) { robj *value = dictGetVal(kde); signalKeyAsReady(db, key, value->type); @@ -1472,7 +1668,7 @@ void scanDatabaseForDeletedKeys(redisDb *emptied, redisDb *replaced_with) { int existed = 0, exists = 0; int original_type = -1, curr_type = -1; - dictEntry *kde = dictFind(emptied->dict, key->ptr); + dictEntry *kde = dbFind(emptied, key->ptr); if (kde) { robj *value = dictGetVal(kde); original_type = value->type; @@ -1480,7 +1676,7 @@ void scanDatabaseForDeletedKeys(redisDb *emptied, redisDb *replaced_with) { } if (replaced_with) { - dictEntry *kde = dictFind(replaced_with->dict, key->ptr); + kde = dbFind(replaced_with, key->ptr); if (kde) { robj *value = dictGetVal(kde); curr_type = value->type; @@ -1521,13 +1717,15 @@ int dbSwapDatabases(int id1, int id2) { /* Swap hash tables. Note that we don't swap blocking_keys, * ready_keys and watched_keys, since we want clients to * remain in the same DB they were. */ - db1->dict = db2->dict; + db1->keys = db2->keys; db1->expires = db2->expires; + db1->hexpires = db2->hexpires; db1->avg_ttl = db2->avg_ttl; db1->expires_cursor = db2->expires_cursor; - db2->dict = aux.dict; + db2->keys = aux.keys; db2->expires = aux.expires; + db2->hexpires = aux.hexpires; db2->avg_ttl = aux.avg_ttl; db2->expires_cursor = aux.expires_cursor; @@ -1549,13 +1747,6 @@ int dbSwapDatabases(int id1, int id2) { * database (temp) as the main (active) database, the actual freeing of old database * (which will now be placed in the temp one) is done later. */ void swapMainDbWithTempDb(redisDb *tempDb) { - if (server.cluster_enabled) { - /* Swap slots_to_keys from tempdb just loaded with main db slots_to_keys. 
*/ - clusterSlotToKeyMapping *aux = server.db->slots_to_keys; - server.db->slots_to_keys = tempDb->slots_to_keys; - tempDb->slots_to_keys = aux; - } - for (int i=0; idict = newdb->dict; + activedb->keys = newdb->keys; activedb->expires = newdb->expires; + activedb->hexpires = newdb->hexpires; activedb->avg_ttl = newdb->avg_ttl; activedb->expires_cursor = newdb->expires_cursor; - newdb->dict = aux.dict; + newdb->keys = aux.keys; newdb->expires = aux.expires; + newdb->hexpires = aux.hexpires; newdb->avg_ttl = aux.avg_ttl; newdb->expires_cursor = aux.expires_cursor; @@ -1632,7 +1825,7 @@ void swapdbCommand(client *c) { *----------------------------------------------------------------------------*/ int removeExpire(redisDb *db, robj *key) { - return dictDelete(db->expires,key->ptr) == DICT_OK; + return kvstoreDictDelete(db->expires, getKeySlot(key->ptr), key->ptr) == DICT_OK; } /* Set an expire to the specified key. If the expire is set in the context @@ -1640,13 +1833,18 @@ int removeExpire(redisDb *db, robj *key) { * to NULL. The 'when' parameter is the absolute unix time in milliseconds * after which the key will no longer be considered valid. 
*/ void setExpire(client *c, redisDb *db, robj *key, long long when) { - dictEntry *kde, *de; + dictEntry *kde, *de, *existing; /* Reuse the sds from the main dict in the expire dict */ - kde = dictFind(db->dict,key->ptr); + int slot = getKeySlot(key->ptr); + kde = kvstoreDictFind(db->keys, slot, key->ptr); serverAssertWithInfo(NULL,key,kde != NULL); - de = dictAddOrFind(db->expires,dictGetKey(kde)); - dictSetSignedIntegerVal(de,when); + de = kvstoreDictAddRaw(db->expires, slot, dictGetKey(kde), &existing); + if (existing) { + dictSetSignedIntegerVal(existing, when); + } else { + dictSetSignedIntegerVal(de, when); + } int writable_slave = server.masterhost && server.repl_slave_ro == 0; if (c && writable_slave && !(c->flags & CLIENT_MASTER)) @@ -1658,9 +1856,8 @@ void setExpire(client *c, redisDb *db, robj *key, long long when) { long long getExpire(redisDb *db, robj *key) { dictEntry *de; - /* No expire? return ASAP */ - if (dictSize(db->expires) == 0 || - (de = dictFind(db->expires,key->ptr)) == NULL) return -1; + if ((de = dbFindExpires(db, key->ptr)) == NULL) + return -1; return dictGetSignedIntegerVal(de); } @@ -1678,23 +1875,24 @@ void deleteExpiredKeyAndPropagate(redisDb *db, robj *keyobj) { server.stat_expiredkeys++; } -/* Propagate expires into slaves and the AOF file. - * When a key expires in the master, a DEL operation for this key is sent - * to all the slaves and the AOF file if enabled. +/* Propagate an implicit key deletion into replicas and the AOF file. + * When a key was deleted in the master by eviction, expiration or a similar + * mechanism a DEL/UNLINK operation for this key is sent + * to all the replicas and the AOF file if enabled. 
* - * This way the key expiry is centralized in one place, and since both - * AOF and the master->slave link guarantee operation ordering, everything - * will be consistent even if we allow write operations against expiring + * This way the key deletion is centralized in one place, and since both + * AOF and the replication link guarantee operation ordering, everything + * will be consistent even if we allow write operations against deleted * keys. * * This function may be called from: * 1. Within call(): Example: Lazy-expire on key access. * In this case the caller doesn't have to do anything * because call() handles server.also_propagate(); or - * 2. Outside of call(): Example: Active-expire, eviction. + * 2. Outside of call(): Example: Active-expire, eviction, slot ownership changed. * In this the caller must remember to call * postExecutionUnitOperations, preferably just after a - * single deletion batch, so that DELs will NOT be wrapped + * single deletion batch, so that DEL/UNLINK will NOT be wrapped * in MULTI/EXEC */ void propagateDeletion(redisDb *db, robj *key, int lazy) { robj *argv[2]; @@ -1704,7 +1902,7 @@ void propagateDeletion(redisDb *db, robj *key, int lazy) { incrRefCount(argv[0]); incrRefCount(argv[1]); - /* If the master decided to expire a key we must propagate it to replicas no matter what.. + /* If the master decided to delete a key we must propagate it to replicas no matter what. * Even if module executed a command without asking for propagation. */ int prev_replication_allowed = server.replication_allowed; server.replication_allowed = 1; @@ -1750,7 +1948,7 @@ int keyIsExpired(redisDb *db, robj *key) { * propagation of a DEL/UNLINK command in AOF / replication stream. * * On replicas, this function does not delete expired keys by default, but - * it still returns 1 if the key is logically expired. To force deletion + * it still returns KEY_EXPIRED if the key is logically expired. 
To force deletion * of logically expired keys even on replicas, use the EXPIRE_FORCE_DELETE_EXPIRED * flag. Note though that if the current client is executing * replicated commands from the master, keys are never considered expired. @@ -1759,11 +1957,12 @@ int keyIsExpired(redisDb *db, robj *key) { * the actual key deletion and propagation of the deletion, use the * EXPIRE_AVOID_DELETE_EXPIRED flag. * - * The return value of the function is 0 if the key is still valid, - * otherwise the function returns 1 if the key is expired. */ -int expireIfNeeded(redisDb *db, robj *key, int flags) { - if (server.lazy_expire_disabled) return 0; - if (!keyIsExpired(db,key)) return 0; + * The return value of the function is KEY_VALID if the key is still valid. + * The function returns KEY_EXPIRED if the key is expired BUT not deleted, + * or returns KEY_DELETED if the key is expired and deleted. */ +keyStatus expireIfNeeded(redisDb *db, robj *key, int flags) { + if (server.lazy_expire_disabled) return KEY_VALID; + if (!keyIsExpired(db,key)) return KEY_VALID; /* If we are running in the context of a replica, instead of * evicting the expired key from the database, we return ASAP: @@ -1773,25 +1972,25 @@ int expireIfNeeded(redisDb *db, robj *key, int flags) { * replicas. * * Still we try to return the right information to the caller, - * that is, 0 if we think the key should be still valid, 1 if - * we think the key is expired at this time. + * that is, KEY_VALID if we think the key should still be valid, + * KEY_EXPIRED if we think the key is expired but don't want to delete it at this time. * * When replicating commands from the master, keys are never considered * expired. 
*/ if (server.masterhost != NULL) { - if (server.current_client && (server.current_client->flags & CLIENT_MASTER)) return 0; - if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return 1; + if (server.current_client && (server.current_client->flags & CLIENT_MASTER)) return KEY_VALID; + if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return KEY_EXPIRED; } /* In some cases we're explicitly instructed to return an indication of a * missing key without actually deleting it, even on masters. */ if (flags & EXPIRE_AVOID_DELETE_EXPIRED) - return 1; + return KEY_EXPIRED; /* If 'expire' action is paused, for whatever reason, then don't expire any key. * Typically, at the end of the pause we will properly expire the key OR we * will have failed over and the new primary will send us the expire. */ - if (isPausedActionsWithUpdate(PAUSE_ACTION_EXPIRE)) return 1; + if (isPausedActionsWithUpdate(PAUSE_ACTION_EXPIRE)) return KEY_EXPIRED; /* The key needs to be converted from static to heap before deleted */ int static_key = key->refcount == OBJ_STATIC_REFCOUNT; @@ -1803,7 +2002,68 @@ int expireIfNeeded(redisDb *db, robj *key, int flags) { if (static_key) { decrRefCount(key); } - return 1; + return KEY_DELETED; +} + +/* CB passed to kvstoreExpand. + * The purpose is to skip expansion of unused dicts in cluster mode (all + * dicts not mapped to *my* slots) */ +static int dbExpandSkipSlot(int slot) { + return !clusterNodeCoversSlot(getMyClusterNode(), slot); +} + +/* + * This functions increases size of the main/expires db to match desired number. + * In cluster mode resizes all individual dictionaries for slots that this node owns. + * + * Based on the parameter `try_expand`, appropriate dict expand API is invoked. + * if try_expand is set to 1, `dictTryExpand` is used else `dictExpand`. + * The return code is either `DICT_OK`/`DICT_ERR` for both the API(s). + * `DICT_OK` response is for successful expansion. 
However, `DICT_ERR` response signifies failure in allocation in + `dictTryExpand` call and in case of `dictExpand` call it signifies no expansion was performed. */ +static int dbExpandGeneric(kvstore *kvs, uint64_t db_size, int try_expand) { + int ret; + if (server.cluster_enabled) { + /* We don't know exact number of keys that would fall into each slot, but we can + * approximate it, assuming even distribution, divide it by the number of slots. */ + int slots = getMyShardSlotCount(); + if (slots == 0) return C_OK; + db_size = db_size / slots; + ret = kvstoreExpand(kvs, db_size, try_expand, dbExpandSkipSlot); + } else { + ret = kvstoreExpand(kvs, db_size, try_expand, NULL); + } + + return ret? C_OK : C_ERR; +} + +int dbExpand(redisDb *db, uint64_t db_size, int try_expand) { + return dbExpandGeneric(db->keys, db_size, try_expand); +} + +int dbExpandExpires(redisDb *db, uint64_t db_size, int try_expand) { + return dbExpandGeneric(db->expires, db_size, try_expand); +} + +static dictEntry *dbFindGeneric(kvstore *kvs, void *key) { + return kvstoreDictFind(kvs, getKeySlot(key), key); +} + +dictEntry *dbFind(redisDb *db, void *key) { + return dbFindGeneric(db->keys, key); +} + +dictEntry *dbFindExpires(redisDb *db, void *key) { + return dbFindGeneric(db->expires, key); +} + +unsigned long long dbSize(redisDb *db) { + return kvstoreSize(db->keys); +} + +unsigned long long dbScan(redisDb *db, unsigned long long cursor, dictScanFunction *scan_cb, void *privdata) { + return kvstoreScan(db->keys, cursor, -1, scan_cb, NULL, privdata); }
diff --git a/src/debug.c b/src/debug.c index a57b1fde9a3..b774ccc656b 100644 --- a/src/debug.c +++ b/src/debug.c @@ -1,31 +1,9 @@ /* - * Copyright (c) 2009-2020, Salvatore Sanfilippo - * Copyright (c) 2020, Redis Labs, Inc + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #include "server.h" @@ -36,6 +14,8 @@ #include "quicklist.h" #include "fpconv_dtoa.h" #include "cluster.h" +#include "threads_mngr.h" +#include "script.h" #include #include @@ -66,12 +46,16 @@ typedef ucontext_t sigcontext_t; /* Globals */ static int bug_report_start = 0; /* True if bug report header was already logged. */ static pthread_mutex_t bug_report_start_mutex = PTHREAD_MUTEX_INITIALIZER; - +/* Mutex for a case when two threads crash at the same time. */ +static pthread_mutex_t signal_handler_lock; +static pthread_mutexattr_t signal_handler_lock_attr; +static volatile int signal_handler_lock_initialized = 0; /* Forward declarations */ -void bugReportStart(void); +int bugReportStart(void); void printCrashReport(void); void bugReportEnd(int killViaSignal, int sig); -void logStackTrace(void *eip, int uplevel); +void logStackTrace(void *eip, int uplevel, int current_thread); +void sigalrmSignalHandler(int sig, siginfo_t *info, void *secret); /* ================================= Debugging ============================== */ @@ -217,17 +201,22 @@ void xorObjectDigest(redisDb *db, robj *keyobj, unsigned char *digest, robj *o) } } else if (o->type == OBJ_HASH) { hashTypeIterator *hi = hashTypeInitIterator(o); - while (hashTypeNext(hi) != C_ERR) { + while (hashTypeNext(hi, 0) != C_ERR) { unsigned char eledigest[20]; sds sdsele; + /* field */ memset(eledigest,0,20); sdsele = hashTypeCurrentObjectNewSds(hi,OBJ_HASH_KEY); mixDigest(eledigest,sdsele,sdslen(sdsele)); sdsfree(sdsele); + /* val */ sdsele = hashTypeCurrentObjectNewSds(hi,OBJ_HASH_VALUE); mixDigest(eledigest,sdsele,sdslen(sdsele)); sdsfree(sdsele); + /* hash-field expiration (HFE) */ + if (hi->expire_time != EB_EXPIRE_TIME_INVALID) + xorDigest(eledigest,"!!hexpire!!",11); xorDigest(digest,eledigest,20); } hashTypeReleaseIterator(hi); @@ -276,7 +265,6 @@ void xorObjectDigest(redisDb *db, robj *keyobj, unsigned char *digest, robj *o) * a different digest. 
*/ void computeDatasetDigest(unsigned char *final) { unsigned char digest[20]; - dictIterator *di = NULL; dictEntry *de; int j; uint32_t aux; @@ -285,17 +273,16 @@ void computeDatasetDigest(unsigned char *final) { for (j = 0; j < server.dbnum; j++) { redisDb *db = server.db+j; + if (kvstoreSize(db->keys) == 0) + continue; + kvstoreIterator *kvs_it = kvstoreIteratorInit(db->keys); - if (dictSize(db->dict) == 0) continue; - di = dictGetSafeIterator(db->dict); - - /* hash the DB id, so the same dataset moved in a different - * DB will lead to a different digest */ + /* hash the DB id, so the same dataset moved in a different DB will lead to a different digest */ aux = htonl(j); mixDigest(final,&aux,sizeof(aux)); /* Iterate this DB writing every entry */ - while((de = dictNext(di)) != NULL) { + while((de = kvstoreIteratorNext(kvs_it)) != NULL) { sds key; robj *keyobj, *o; @@ -312,7 +299,7 @@ void computeDatasetDigest(unsigned char *final) { xorDigest(final,digest,20); decrRefCount(keyobj); } - dictReleaseIterator(di); + kvstoreIteratorRelease(kvs_it); } } @@ -464,9 +451,9 @@ void debugCommand(client *c) { "SEGFAULT", " Crash the server with sigsegv.", "SET-ACTIVE-EXPIRE <0|1>", -" Setting it to 0 disables expiring keys in background when they are not", -" accessed (otherwise the Redis behavior). Setting it to 1 reenables back the", -" default.", +" Setting it to 0 disables expiring keys (and hash-fields) in background ", +" when they are not accessed (otherwise the Redis behavior). Setting it", +" to 1 reenables back the default.", "QUICKLIST-PACKED-THRESHOLD ", " Sets the threshold for elements to be inserted as plain vs packed nodes", " Default value is 1GB, allows values up to 4GB. 
Setting to 0 restores to default.", @@ -493,11 +480,11 @@ void debugCommand(client *c) { " In case RESET is provided the peak reset time will be restored to the default value", "REPLYBUFFER RESIZING <0|1>", " Enable or disable the reply buffer resize cron job", -"CLUSTERLINK KILL ", -" Kills the link based on the direction to/from (both) with the provided node." , +"DICT-RESIZING <0|1>", +" Enable or disable the main dict and expire dict resizing.", NULL }; - addReplyHelp(c, help); + addExtendedReplyHelp(c, help, clusterDebugCommandExtendedHelp()); } else if (!strcasecmp(c->argv[1]->ptr,"segfault")) { /* Compiler gives warnings about writing to a random address * e.g "*((char*)-1) = 'x';". As a workaround, we map a read-only area @@ -605,7 +592,7 @@ NULL robj *val; char *strenc; - if ((de = dictFind(c->db->dict,c->argv[2]->ptr)) == NULL) { + if ((de = dbFind(c->db, c->argv[2]->ptr)) == NULL) { addReplyErrorObject(c,shared.nokeyerr); return; } @@ -657,7 +644,7 @@ NULL robj *val; sds key; - if ((de = dictFind(c->db->dict,c->argv[2]->ptr)) == NULL) { + if ((de = dbFind(c->db, c->argv[2]->ptr)) == NULL) { addReplyErrorObject(c,shared.nokeyerr); return; } @@ -683,10 +670,14 @@ NULL if ((o = objectCommandLookupOrReply(c,c->argv[2],shared.nokeyerr)) == NULL) return; - if (o->encoding != OBJ_ENCODING_LISTPACK) { + if (o->encoding != OBJ_ENCODING_LISTPACK && o->encoding != OBJ_ENCODING_LISTPACK_EX) { addReplyError(c,"Not a listpack encoded object."); } else { - lpRepr(o->ptr); + if (o->encoding == OBJ_ENCODING_LISTPACK) + lpRepr(o->ptr); + else if (o->encoding == OBJ_ENCODING_LISTPACK_EX) + lpRepr(((listpackEx*)o->ptr)->lp); + addReplyStatus(c,"Listpack structure printed on stdout"); } } else if (!strcasecmp(c->argv[1]->ptr,"quicklist") && (c->argc == 3 || c->argc == 4)) { @@ -713,7 +704,12 @@ NULL if (getPositiveLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != C_OK) return; - if (dictTryExpand(c->db->dict, keys) != DICT_OK) { + if (server.loading || 
server.async_loading) { + addReplyErrorObject(c, shared.loadingerr); + return; + } + + if (dbExpand(c->db, keys, 1) == C_ERR) { addReplyError(c, "OOM in dictTryExpand"); return; } @@ -761,7 +757,7 @@ NULL /* We don't use lookupKey because a debug command should * work on logically expired keys */ dictEntry *de; - robj *o = ((de = dictFind(c->db->dict,c->argv[j]->ptr)) == NULL) ? NULL : dictGetVal(de); + robj *o = ((de = dbFind(c->db, c->argv[j]->ptr)) == NULL) ? NULL : dictGetVal(de); if (o) xorObjectDigest(c->db,c->argv[j],digest,o); sds d = sdsempty(); @@ -849,7 +845,7 @@ NULL { int memerr; unsigned long long sz = memtoull((const char *)c->argv[2]->ptr, &memerr); - if (memerr || !quicklistisSetPackedThreshold(sz)) { + if (memerr || !quicklistSetPackedThreshold(sz)) { addReplyError(c, "argument must be a memory value bigger than 1 and smaller than 4gb"); } else { addReply(c,shared.ok); @@ -905,11 +901,11 @@ NULL full = 1; stats = sdscatprintf(stats,"[Dictionary HT]\n"); - dictGetStats(buf,sizeof(buf),server.db[dbid].dict,full); + kvstoreGetStats(server.db[dbid].keys, buf, sizeof(buf), full); stats = sdscat(stats,buf); stats = sdscatprintf(stats,"[Expires HT]\n"); - dictGetStats(buf,sizeof(buf),server.db[dbid].expires,full); + kvstoreGetStats(server.db[dbid].expires, buf, sizeof(buf), full); stats = sdscat(stats,buf); addReplyVerbatim(c,stats,sdslen(stats),"txt"); @@ -1015,34 +1011,33 @@ NULL return; } addReply(c, shared.ok); - } else if(!strcasecmp(c->argv[1]->ptr,"CLUSTERLINK") && - !strcasecmp(c->argv[2]->ptr,"KILL") && - c->argc == 5) { - if (!server.cluster_enabled) { - addReplyError(c, "Debug option only available for cluster mode enabled setup!"); - return; - } - - /* Find the node. */ - clusterNode *n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr)); - if (!n) { - addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[4]->ptr); - return; - } - - /* Terminate the link based on the direction or all. 
*/ - if (!strcasecmp(c->argv[3]->ptr,"from")) { - freeClusterLink(n->inbound_link); - } else if (!strcasecmp(c->argv[3]->ptr,"to")) { - freeClusterLink(n->link); - } else if (!strcasecmp(c->argv[3]->ptr,"all")) { - freeClusterLink(n->link); - freeClusterLink(n->inbound_link); + } else if (!strcasecmp(c->argv[1]->ptr, "dict-resizing") && c->argc == 3) { + server.dict_resizing = atoi(c->argv[2]->ptr); + addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"script") && c->argc == 3) { + if (!strcasecmp(c->argv[2]->ptr,"list")) { + dictIterator *di = dictGetIterator(getLuaScripts()); + dictEntry *de; + while ((de = dictNext(di)) != NULL) { + luaScript *script = dictGetVal(de); + sds *sha = dictGetKey(de); + serverLog(LL_WARNING, "SCRIPT SHA: %s\n%s", (char*)sha, (char*)script->body->ptr); + } + dictReleaseIterator(di); + } else if (sdslen(c->argv[2]->ptr) == 40) { + dictEntry *de; + if ((de = dictFind(getLuaScripts(), c->argv[2]->ptr)) == NULL) { + addReplyErrorObject(c, shared.noscripterr); + return; + } + luaScript *script = dictGetVal(de); + serverLog(LL_WARNING, "SCRIPT SHA: %s\n%s", (char*)c->argv[2]->ptr, (char*)script->body->ptr); } else { - addReplyErrorFormat(c, "Unknown direction %s", (char*) c->argv[3]->ptr); + addReplySubcommandSyntaxError(c); + return; } addReply(c,shared.ok); - } else { + } else if(!handleDebugClusterCommand(c)) { addReplySubcommandSyntaxError(c); return; } @@ -1050,20 +1045,23 @@ NULL /* =========================== Crash handling ============================== */ +__attribute__ ((noinline)) void _serverAssert(const char *estr, const char *file, int line) { - bugReportStart(); - serverLog(LL_WARNING,"=== ASSERTION FAILED ==="); + int new_report = bugReportStart(); + serverLog(LL_WARNING,"=== %sASSERTION FAILED ===", new_report ? 
"" : "RECURSIVE "); serverLog(LL_WARNING,"==> %s:%d '%s' is not true",file,line,estr); if (server.crashlog_enabled) { #ifdef HAVE_BACKTRACE - logStackTrace(NULL, 1); + logStackTrace(NULL, 1, 0); #endif - printCrashReport(); + /* If this was a recursive assertion, it what most likely generated + * from printCrashReport. */ + if (new_report) printCrashReport(); } // remove the signal handler so on abort() we will output the crash report. - removeSignalHandlers(); + removeSigSegvHandlers(); bugReportEnd(0, 0); } @@ -1116,7 +1114,7 @@ void serverLogObjectDebugInfo(const robj *o) { } else if (o->type == OBJ_SET) { serverLog(LL_WARNING,"Set size: %d", (int) setTypeSize(o)); } else if (o->type == OBJ_HASH) { - serverLog(LL_WARNING,"Hash size: %d", (int) hashTypeLength(o)); + serverLog(LL_WARNING,"Hash size: %d", (int) hashTypeLength(o, 0)); } else if (o->type == OBJ_ZSET) { serverLog(LL_WARNING,"Sorted set size: %d", (int) zsetLength(o)); if (o->encoding == OBJ_ENCODING_SKIPLIST) @@ -1139,6 +1137,7 @@ void _serverAssertWithInfo(const client *c, const robj *o, const char *estr, con _serverAssert(estr,file,line); } +__attribute__ ((noinline)) void _serverPanic(const char *file, int line, const char *msg, ...) { va_list ap; va_start(ap,msg); @@ -1146,31 +1145,37 @@ void _serverPanic(const char *file, int line, const char *msg, ...) { vsnprintf(fmtmsg,sizeof(fmtmsg),msg,ap); va_end(ap); - bugReportStart(); + int new_report = bugReportStart(); serverLog(LL_WARNING,"------------------------------------------------"); serverLog(LL_WARNING,"!!! Software Failure. Press left mouse button to continue"); serverLog(LL_WARNING,"Guru Meditation: %s #%s:%d",fmtmsg,file,line); if (server.crashlog_enabled) { #ifdef HAVE_BACKTRACE - logStackTrace(NULL, 1); + logStackTrace(NULL, 1, 0); #endif - printCrashReport(); + /* If this was a recursive panic, it what most likely generated + * from printCrashReport. 
*/ + if (new_report) printCrashReport(); } // remove the signal handler so on abort() we will output the crash report. - removeSignalHandlers(); + removeSigSegvHandlers(); bugReportEnd(0, 0); } -void bugReportStart(void) { +/* Start a bug report, returning 1 if this is the first time this function was called, 0 otherwise. */ +int bugReportStart(void) { pthread_mutex_lock(&bug_report_start_mutex); if (bug_report_start == 0) { serverLogRaw(LL_WARNING|LL_RAW, "\n\n=== REDIS BUG REPORT START: Cut & paste starting from here ===\n"); bug_report_start = 1; + pthread_mutex_unlock(&bug_report_start_mutex); + return 1; } pthread_mutex_unlock(&bug_report_start_mutex); + return 0; } #ifdef HAVE_BACKTRACE @@ -1190,7 +1195,7 @@ static void* getAndSetMcontextEip(ucontext_t *uc, void *eip) { } \ return old_val; \ } while(0) -#if defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6) +#if defined(__APPLE__) && !defined(MAC_OS_10_6_DETECTED) /* OSX < 10.6 */ #if defined(__x86_64__) GET_SET_RETURN(uc->uc_mcontext->__ss.__rip, eip); @@ -1199,7 +1204,7 @@ static void* getAndSetMcontextEip(ucontext_t *uc, void *eip) { #else GET_SET_RETURN(uc->uc_mcontext->__ss.__srr0, eip); #endif -#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6) +#elif defined(__APPLE__) && defined(MAC_OS_10_6_DETECTED) /* OSX >= 10.6 */ #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__) GET_SET_RETURN(uc->uc_mcontext->__ss.__rip, eip); @@ -1290,7 +1295,7 @@ void logRegisters(ucontext_t *uc) { } while(0) /* OSX */ -#if defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6) +#if defined(__APPLE__) && defined(MAC_OS_10_6_DETECTED) /* OSX AMD64 */ #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__) serverLog(LL_WARNING, @@ -1815,24 +1820,132 @@ void closeDirectLogFiledes(int fd) { if (!log_to_stdout) close(fd); } +#if defined(HAVE_BACKTRACE) && defined(__linux__) +static int stacktrace_pipe[2] = {0}; +static void setupStacktracePipe(void) { + if (-1 == anetPipe(stacktrace_pipe, O_CLOEXEC | 
O_NONBLOCK, O_CLOEXEC | O_NONBLOCK)) { + serverLog(LL_WARNING, "setupStacktracePipe failed: %s", strerror(errno)); + } +} +#else +static void setupStacktracePipe(void) {/* we don't need a pipe to write the stacktraces */} +#endif #ifdef HAVE_BACKTRACE +#define BACKTRACE_MAX_SIZE 100 + +#ifdef __linux__ +#if !defined(_GNU_SOURCE) +#define _GNU_SOURCE +#endif +#include +#include +#include +#include + +#define TIDS_MAX_SIZE 50 +static size_t get_ready_to_signal_threads_tids(int sig_num, pid_t tids[TIDS_MAX_SIZE]); + +typedef struct { + char thread_name[16]; + int trace_size; + pid_t tid; + void *trace[BACKTRACE_MAX_SIZE]; +} stacktrace_data; + +__attribute__ ((noinline)) static void collect_stacktrace_data(void) { + stacktrace_data trace_data = {{0}}; + + /* Get the stack trace first! */ + trace_data.trace_size = backtrace(trace_data.trace, BACKTRACE_MAX_SIZE); + + /* get the thread name */ + prctl(PR_GET_NAME, trace_data.thread_name); + + /* get the thread id */ + trace_data.tid = syscall(SYS_gettid); + + /* Send the output to the main process*/ + if (write(stacktrace_pipe[1], &trace_data, sizeof(trace_data)) == -1) {/* Avoid warning. 
*/}; +} + +__attribute__ ((noinline)) +static void writeStacktraces(int fd, int uplevel) { + /* get the list of all the process's threads that don't block or ignore the THREADS_SIGNAL */ + pid_t tids[TIDS_MAX_SIZE]; + size_t len_tids = get_ready_to_signal_threads_tids(THREADS_SIGNAL, tids); + if (!len_tids) { + serverLogRawFromHandler(LL_WARNING, "writeStacktraces(): Failed to get the process's threads."); + } + + char buff[PIPE_BUF]; + /* Clear the stacktraces pipe */ + while (read(stacktrace_pipe[0], &buff, sizeof(buff)) > 0) {} + + /* ThreadsManager_runOnThreads returns 0 if it is already running */ + if (!ThreadsManager_runOnThreads(tids, len_tids, collect_stacktrace_data)) return; + + size_t collected = 0; + + pid_t calling_tid = syscall(SYS_gettid); + + /* Read the stacktrace_pipe until it's empty */ + stacktrace_data curr_stacktrace_data = {{0}}; + while (read(stacktrace_pipe[0], &curr_stacktrace_data, sizeof(curr_stacktrace_data)) > 0) { + /* stacktrace header includes the tid and the thread's name */ + snprintf_async_signal_safe(buff, sizeof(buff), "\n%d %s", curr_stacktrace_data.tid, curr_stacktrace_data.thread_name); + if (write(fd,buff,strlen(buff)) == -1) {/* Avoid warning. */}; + + /* skip kernel call to the signal handler, the signal handler and the callback addresses */ + int curr_uplevel = 3; + + if (curr_stacktrace_data.tid == calling_tid) { + /* skip signal syscall and ThreadsManager_runOnThreads */ + curr_uplevel += uplevel + 2; + /* Add an indication to header of the thread that is handling the log file */ + if (write(fd," *\n",strlen(" *\n")) == -1) {/* Avoid warning. */}; + } else { + /* just add a new line */ + if (write(fd,"\n",strlen("\n")) == -1) {/* Avoid warning. 
*/}; + } + + /* add the stacktrace */ + backtrace_symbols_fd(curr_stacktrace_data.trace+curr_uplevel, curr_stacktrace_data.trace_size-curr_uplevel, fd); + + ++collected; + } + + snprintf_async_signal_safe(buff, sizeof(buff), "\n%lu/%lu expected stacktraces.\n", (long unsigned)(collected), (long unsigned)len_tids); + if (write(fd,buff,strlen(buff)) == -1) {/* Avoid warning. */}; + +} + +#endif /* __linux__ */ +__attribute__ ((noinline)) +static void writeCurrentThreadsStackTrace(int fd, int uplevel) { + void *trace[BACKTRACE_MAX_SIZE]; + + int trace_size = backtrace(trace, BACKTRACE_MAX_SIZE); + + char *msg = "\nBacktrace:\n"; + if (write(fd,msg,strlen(msg)) == -1) {/* Avoid warning. */}; + backtrace_symbols_fd(trace+uplevel, trace_size-uplevel, fd); +} /* Logs the stack trace using the backtrace() call. This function is designed * to be called from signal handlers safely. * The eip argument is optional (can take NULL). * The uplevel argument indicates how many of the calling functions to skip. + * Functions that are taken in consideration in "uplevel" should be declared with + * __attribute__ ((noinline)) to make sure the compiler won't inline them. */ -void logStackTrace(void *eip, int uplevel) { - void *trace[100]; - int trace_size = 0, fd = openDirectLogFiledes(); +__attribute__ ((noinline)) +void logStackTrace(void *eip, int uplevel, int current_thread) { + int fd = openDirectLogFiledes(); char *msg; uplevel++; /* skip this function */ if (fd == -1) return; /* If we can't log there is anything to do. */ - /* Get the stack trace first! */ - trace_size = backtrace(trace, 100); - msg = "\n------ STACK TRACE ------\n"; if (write(fd,msg,strlen(msg)) == -1) {/* Avoid warning. 
*/}; @@ -1844,9 +1957,21 @@ void logStackTrace(void *eip, int uplevel) { } /* Write symbols to log file */ - msg = "\nBacktrace:\n"; + ++uplevel; +#ifdef __linux__ + if (current_thread) { + writeCurrentThreadsStackTrace(fd, uplevel); + } else { + writeStacktraces(fd, uplevel); + } +#else + /* Outside of linux, we only support writing the current thread. */ + UNUSED(current_thread); + writeCurrentThreadsStackTrace(fd, uplevel); +#endif + msg = "\n------ STACK TRACE DONE ------\n"; if (write(fd,msg,strlen(msg)) == -1) {/* Avoid warning. */}; - backtrace_symbols_fd(trace+uplevel, trace_size-uplevel, fd); + /* Cleanup */ closeDirectLogFiledes(fd); @@ -1855,11 +1980,17 @@ void logStackTrace(void *eip, int uplevel) { #endif /* HAVE_BACKTRACE */ sds genClusterDebugString(sds infostring) { + sds cluster_info = genClusterInfoString(); + sds cluster_nodes = clusterGenNodesDescription(NULL, 0, 0); + infostring = sdscatprintf(infostring, "\r\n# Cluster info\r\n"); - infostring = sdscatsds(infostring, genClusterInfoString()); + infostring = sdscatsds(infostring, cluster_info); infostring = sdscatprintf(infostring, "\n------ CLUSTER NODES OUTPUT ------\n"); - infostring = sdscatsds(infostring, clusterGenNodesDescription(NULL, 0, 0)); - + infostring = sdscatsds(infostring, cluster_nodes); + + sdsfree(cluster_info); + sdsfree(cluster_nodes); + return infostring; } @@ -1936,7 +2067,7 @@ void logCurrentClient(client *cc, const char *title) { dictEntry *de; key = getDecodedObject(cc->argv[1]); - de = dictFind(cc->db->dict, key->ptr); + de = dbFind(cc->db, key->ptr); if (de) { val = dictGetVal(de); serverLog(LL_WARNING,"key '%s' found in DB containing the following object:", (char*)key->ptr); @@ -1961,7 +2092,7 @@ int memtest_test_linux_anonymous_maps(void) { int regions = 0, j; int fd = openDirectLogFiledes(); - if (!fd) return 0; + if (fd == -1) return 0; fp = fopen("/proc/self/maps","r"); if (!fp) { @@ -2116,9 +2247,19 @@ void invalidFunctionWasCalled(void) {} typedef void 
(*invalidFunctionWasCalledType)(void); -void sigsegvHandler(int sig, siginfo_t *info, void *secret) { +__attribute__ ((noinline)) +static void sigsegvHandler(int sig, siginfo_t *info, void *secret) { UNUSED(secret); UNUSED(info); + int print_full_crash_info = 1; + /* Check if it is safe to enter the signal handler. second thread crashing at the same time will deadlock. */ + if(pthread_mutex_lock(&signal_handler_lock) == EDEADLK) { + /* If this thread already owns the lock (meaning we crashed during handling a signal) switch + * to printing the minimal information about the crash. */ + serverLogRawFromHandler(LL_WARNING, + "Crashed running signal handler. Providing reduced version of recursive crash report."); + print_full_crash_info = 0; + } bugReportStart(); serverLog(LL_WARNING, @@ -2151,7 +2292,9 @@ void sigsegvHandler(int sig, siginfo_t *info, void *secret) { getAndSetMcontextEip(uc, ptr); } - logStackTrace(eip, 1); + /* When printing the reduced crash info, just print the current thread + * to avoid race conditions with the multi-threaded stack collector. */ + logStackTrace(eip, 1, !print_full_crash_info); if (eip == info->si_addr) { /* Restore old eip */ @@ -2161,7 +2304,7 @@ void sigsegvHandler(int sig, siginfo_t *info, void *secret) { logRegisters(uc); #endif - printCrashReport(); + if (print_full_crash_info) printCrashReport(); #ifdef HAVE_BACKTRACE if (eip != NULL) @@ -2171,6 +2314,60 @@ void sigsegvHandler(int sig, siginfo_t *info, void *secret) { bugReportEnd(1, sig); } +void setupDebugSigHandlers(void) { + setupStacktracePipe(); + + setupSigSegvHandler(); + + struct sigaction act; + + sigemptyset(&act.sa_mask); + act.sa_flags = SA_SIGINFO; + act.sa_sigaction = sigalrmSignalHandler; + sigaction(SIGALRM, &act, NULL); +} + +void setupSigSegvHandler(void) { + /* Initialize the signal handler lock. + Attempting to initialize an already initialized mutex or mutexattr results in undefined behavior. 
*/ + if (!signal_handler_lock_initialized) { + /* Set signal handler with error checking attribute. re-lock within the same thread will error. */ + pthread_mutexattr_init(&signal_handler_lock_attr); + pthread_mutexattr_settype(&signal_handler_lock_attr, PTHREAD_MUTEX_ERRORCHECK); + pthread_mutex_init(&signal_handler_lock, &signal_handler_lock_attr); + signal_handler_lock_initialized = 1; + } + + struct sigaction act; + + sigemptyset(&act.sa_mask); + /* SA_NODEFER disables adding the signal to the signal mask of the + * calling process on entry to the signal handler unless it is included in the sa_mask field. */ + /* SA_SIGINFO flag is set to raise the function defined in sa_sigaction. + * Otherwise, sa_handler is used. */ + act.sa_flags = SA_NODEFER | SA_SIGINFO; + act.sa_sigaction = sigsegvHandler; + if(server.crashlog_enabled) { + sigaction(SIGSEGV, &act, NULL); + sigaction(SIGBUS, &act, NULL); + sigaction(SIGFPE, &act, NULL); + sigaction(SIGILL, &act, NULL); + sigaction(SIGABRT, &act, NULL); + } +} + +void removeSigSegvHandlers(void) { + struct sigaction act; + sigemptyset(&act.sa_mask); + act.sa_flags = SA_NODEFER | SA_RESETHAND; + act.sa_handler = SIG_DFL; + sigaction(SIGSEGV, &act, NULL); + sigaction(SIGBUS, &act, NULL); + sigaction(SIGFPE, &act, NULL); + sigaction(SIGILL, &act, NULL); + sigaction(SIGABRT, &act, NULL); +} + void printCrashReport(void) { /* Log INFO and CLIENT LIST */ logServerInfo(); @@ -2193,7 +2390,7 @@ void printCrashReport(void) { void bugReportEnd(int killViaSignal, int sig) { struct sigaction act; - serverLogRaw(LL_WARNING|LL_RAW, + serverLogRawFromHandler(LL_WARNING|LL_RAW, "\n=== REDIS BUG REPORT END. Make sure to include from START to END. 
===\n\n" " Please report the crash by opening an issue on github:\n\n" " http://github.com/redis/redis/issues\n\n" @@ -2206,7 +2403,7 @@ void bugReportEnd(int killViaSignal, int sig) { if (server.daemonize && server.supervised == 0 && server.pidfile) unlink(server.pidfile); if (!killViaSignal) { - /* To avoid issues with valgrind, we may wanna exit rahter than generate a signal */ + /* To avoid issues with valgrind, we may wanna exit rather than generate a signal */ if (server.use_exit_on_panic) { /* Using _exit to bypass false leak reports by gcc ASAN */ fflush(stdout); @@ -2218,7 +2415,7 @@ void bugReportEnd(int killViaSignal, int sig) { /* Make sure we exit with the right signal at the end. So for instance * the core will be dumped if enabled. */ sigemptyset (&act.sa_mask); - act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND; + act.sa_flags = 0; act.sa_handler = SIG_DFL; sigaction (sig, &act, NULL); kill(getpid(),sig); @@ -2251,22 +2448,27 @@ void serverLogHexDump(int level, char *descr, void *value, size_t len) { /* =========================== Software Watchdog ============================ */ #include -void watchdogSignalHandler(int sig, siginfo_t *info, void *secret) { +void sigalrmSignalHandler(int sig, siginfo_t *info, void *secret) { #ifdef HAVE_BACKTRACE ucontext_t *uc = (ucontext_t*) secret; #else (void)secret; #endif - UNUSED(info); UNUSED(sig); - serverLogFromHandler(LL_WARNING,"\n--- WATCHDOG TIMER EXPIRED ---"); + /* SIGALRM can be sent explicitly to the process calling kill() to get the stacktraces, + or every watchdog_period interval. 
In the last case, si_pid is not set */ + if(info->si_pid == 0) { + serverLogRawFromHandler(LL_WARNING,"\n--- WATCHDOG TIMER EXPIRED ---"); + } else { + serverLogRawFromHandler(LL_WARNING, "\nReceived SIGALRM"); + } #ifdef HAVE_BACKTRACE - logStackTrace(getAndSetMcontextEip(uc, NULL), 1); + logStackTrace(getAndSetMcontextEip(uc, NULL), 1, 0); #else - serverLogFromHandler(LL_WARNING,"Sorry: no support for backtrace()."); + serverLogRawFromHandler(LL_WARNING,"Sorry: no support for backtrace()."); #endif - serverLogFromHandler(LL_WARNING,"--------\n"); + serverLogRawFromHandler(LL_WARNING,"--------\n"); } /* Schedule a SIGALRM delivery after the specified period in milliseconds. @@ -2284,25 +2486,10 @@ void watchdogScheduleSignal(int period) { setitimer(ITIMER_REAL, &it, NULL); } void applyWatchdogPeriod(void) { - struct sigaction act; - /* Disable watchdog when period is 0 */ if (server.watchdog_period == 0) { watchdogScheduleSignal(0); /* Stop the current timer. */ - - /* Set the signal handler to SIG_IGN, this will also remove pending - * signals from the queue. */ - sigemptyset(&act.sa_mask); - act.sa_flags = 0; - act.sa_handler = SIG_IGN; - sigaction(SIGALRM, &act, NULL); } else { - /* Setup the signal handler. */ - sigemptyset(&act.sa_mask); - act.sa_flags = SA_SIGINFO; - act.sa_sigaction = watchdogSignalHandler; - sigaction(SIGALRM, &act, NULL); - /* If the configured period is smaller than twice the timer period, it is * too short for the software watchdog to work reliably. Fix it now * if needed. */ @@ -2320,3 +2507,145 @@ void debugDelay(int usec) { if (usec < 0) usec = (rand() % -usec) == 0 ? 1: 0; if (usec) usleep(usec); } + +#ifdef HAVE_BACKTRACE +#ifdef __linux__ + +/* =========================== Stacktrace Utils ============================ */ + + + +/** If it doesn't block and doesn't ignore, return 1 (the thread will handle the signal) + * If thread tid blocks or ignores sig_num returns 0 (thread is not ready to catch the signal). 
+ * also returns 0 if something is wrong and prints a warning message to the log file **/ +static int is_thread_ready_to_signal(const char *proc_pid_task_path, const char *tid, int sig_num) { + /* Open the thread's status file path /proc/<pid>/task/<tid>/status */ + char path_buff[PATH_MAX]; + snprintf_async_signal_safe(path_buff, PATH_MAX, "%s/%s/status", proc_pid_task_path, tid); + + int thread_status_file = open(path_buff, O_RDONLY); + char buff[PATH_MAX]; + if (thread_status_file == -1) { + serverLogFromHandler(LL_WARNING, "tid:%s: failed to open %s file", tid, path_buff); + return 0; + } + + int ret = 1; + size_t field_name_len = strlen("SigBlk:\t"); /* SigIgn has the same length */ + char *line = NULL; + size_t fields_count = 2; + while ((line = fgets_async_signal_safe(buff, PATH_MAX, thread_status_file)) && fields_count) { + /* iterate the file until we reach SigBlk or SigIgn field line */ + if (!strncmp(buff, "SigBlk:\t", field_name_len) || !strncmp(buff, "SigIgn:\t", field_name_len)) { + line = buff + field_name_len; + unsigned long sig_mask; + if (-1 == string2ul_base16_async_signal_safe(line, sizeof(buff), &sig_mask)) { + serverLogRawFromHandler(LL_WARNING, "Can't convert signal mask to an unsigned long due to an overflow"); + ret = 0; + break; + } + + /* The bit position in a signal mask aligns with the signal number. 
Since signal numbers start from 1 + we need to adjust the signal number by subtracting 1 to align it correctly with the zero-based indexing used */ + if (sig_mask & (1L << (sig_num - 1))) { /* if the signal is blocked/ignored return 0 */ + ret = 0; + break; + } + --fields_count; + } + } + + close(thread_status_file); + + /* if we reached EOF, it means we haven't found SigBlk or/and SigIgn, something is wrong */ + if (line == NULL) { + ret = 0; + serverLogFromHandler(LL_WARNING, "tid:%s: failed to find SigBlk or/and SigIgn field(s) in %s/%s/status file", tid, proc_pid_task_path, tid); + } + return ret; +} + +/** We are using syscall(SYS_getdents64) to read directories, which unlike opendir(), is considered + * async-signal-safe. This function wrapper getdents64() in glibc is supported as of glibc 2.30. + * To support earlier versions of glibc, we use syscall(SYS_getdents64), which requires defining + * linux_dirent64 ourselves. This structure is very old and stable: It will not change unless the kernel + * chooses to break compatibility with all existing binaries. Highly Unlikely. +*/ +struct linux_dirent64 { + unsigned long long d_ino; + long long d_off; + unsigned short d_reclen; /* Length of this linux_dirent */ + unsigned char d_type; + char d_name[256]; /* Filename (null-terminated) */ +}; + +/** Returns the number of the process's threads that can receive signal sig_num. + * Writes into tids the tids of these threads. + * If it fails, returns 0. +*/ +static size_t get_ready_to_signal_threads_tids(int sig_num, pid_t tids[TIDS_MAX_SIZE]) { + /* Open /proc//task file. */ + char path_buff[PATH_MAX]; + snprintf_async_signal_safe(path_buff, PATH_MAX, "/proc/%d/task", getpid()); + + int dir; + if (-1 == (dir = open(path_buff, O_RDONLY | O_DIRECTORY))) return 0; + + size_t tids_count = 0; + pid_t calling_tid = syscall(SYS_gettid); + int current_thread_index = -1; + long nread; + char buff[PATH_MAX]; + + /* readdir() is not async-signal-safe (AS-safe). 
+ Hence, we read the file using SYS_getdents64, which is considered AS-safe*/ + while ((nread = syscall(SYS_getdents64, dir, buff, PATH_MAX))) { + if (nread == -1) { + close(dir); + serverLogRawFromHandler(LL_WARNING, "get_ready_to_signal_threads_tids(): Failed to read the process's task directory"); + return 0; + } + /* Each thread is represented by a directory */ + for (long pos = 0; pos < nread;) { + struct linux_dirent64 *entry = (struct linux_dirent64 *)(buff + pos); + pos += entry->d_reclen; + /* Skip irrelevant directories. */ + if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) continue; + + /* the thread's directory name is equivalent to its tid. */ + long tid; + string2l(entry->d_name, strlen(entry->d_name), &tid); + + if(!is_thread_ready_to_signal(path_buff, entry->d_name, sig_num)) continue; + + if(tid == calling_tid) { + current_thread_index = tids_count; + } + + /* save the thread id */ + tids[tids_count++] = tid; + + /* Stop if we reached the maximum threads number. */ + if(tids_count == TIDS_MAX_SIZE) { + serverLogRawFromHandler(LL_WARNING, "get_ready_to_signal_threads_tids(): Reached the limit of the tids buffer."); + break; + } + } + + if(tids_count == TIDS_MAX_SIZE) break; + } + + /* Swap the last tid with the current thread id */ + if(current_thread_index != -1) { + pid_t last_tid = tids[tids_count - 1]; + + tids[tids_count - 1] = calling_tid; + tids[current_thread_index] = last_tid; + } + + close(dir); + + return tids_count; +} +#endif /* __linux__ */ +#endif /* HAVE_BACKTRACE */ diff --git a/src/debugmacro.h b/src/debugmacro.h index dcd79a33f85..e94b0c0e3bf 100644 --- a/src/debugmacro.h +++ b/src/debugmacro.h @@ -2,32 +2,11 @@ * * ----------------------------------------------------------------------------- * - * Copyright (c) 2016, Salvatore Sanfilippo + * Copyright (c) 2016-Present, Redis Ltd. * All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef _REDIS_DEBUGMACRO_H_ diff --git a/src/defrag.c b/src/defrag.c index ff63cf8fdec..78de7224867 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -5,42 +5,28 @@ * We do that by scanning the keyspace and for each pointer we have, we can try to * ask the allocator if moving it to a new address will help reduce fragmentation. 
* - * Copyright (c) 2020, Redis Labs, Inc + * Copyright (c) 2020-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #include "server.h" -#include "cluster.h" -#include -#include #include #ifdef HAVE_DEFRAG +typedef struct defragCtx { + void *privdata; + int slot; +} defragCtx; + +typedef struct defragPubSubCtx { + kvstore *pubsub_channels; + dict *(*clientPubSubChannels)(client*); +} defragPubSubCtx; + /* this method was added to jemalloc in order to help us understand which * pointers are worthwhile moving and which aren't */ int je_get_defrag_hint(void* ptr); @@ -60,7 +46,7 @@ void* activeDefragAlloc(void *ptr) { /* move this allocation to a new allocation. * make sure not to use the thread cache. so that we don't get back the same * pointers we try to free */ - size = zmalloc_size(ptr); + size = zmalloc_usable_size(ptr); newptr = zmalloc_no_tcache(size); memcpy(newptr, ptr, size); zfree_no_tcache(ptr); @@ -84,14 +70,32 @@ sds activeDefragSds(sds sdsptr) { return NULL; } -/* Defrag helper for robj and/or string objects +/* Defrag helper for hfield strings * * returns NULL in case the allocation wasn't moved. * when it returns a non-null value, the old pointer was already released * and should NOT be accessed. */ -robj *activeDefragStringOb(robj* ob) { +hfield activeDefragHfield(hfield hf) { + void *ptr = hfieldGetAllocPtr(hf); + void *newptr = activeDefragAlloc(ptr); + if (newptr) { + size_t offset = hf - (char*)ptr; + hf = (char*)newptr + offset; + return hf; + } + return NULL; +} + +/* Defrag helper for robj and/or string objects with expected refcount. + * + * Like activeDefragStringOb, but it requires the caller to pass in the expected + * reference count. In some cases, the caller needs to update a robj whose + * reference count is not 1, in these cases, the caller must explicitly pass + * in the reference count, otherwise defragmentation will not be performed. + * Note that the caller is responsible for updating any other references to the robj. 
*/ +robj *activeDefragStringObEx(robj* ob, int expected_refcount) { robj *ret = NULL; - if (ob->refcount!=1) + if (ob->refcount!=expected_refcount) return NULL; /* try to defrag robj (only if not an EMBSTR type (handled below). */ @@ -122,6 +126,15 @@ robj *activeDefragStringOb(robj* ob) { return ret; } +/* Defrag helper for robj and/or string objects + * + * returns NULL in case the allocation wasn't moved. + * when it returns a non-null value, the old pointer was already released + * and should NOT be accessed. */ +robj *activeDefragStringOb(robj* ob) { + return activeDefragStringObEx(ob, 1); +} + /* Defrag helper for lua scripts * * returns NULL in case the allocation wasn't moved. @@ -143,11 +156,20 @@ luaScript *activeDefragLuaScript(luaScript *script) { } /* Defrag helper for dict main allocations (dict struct, and hash tables). - * receives a pointer to the dict* and implicitly updates it when the dict - * struct itself was moved. Returns a stat of how many pointers were moved. */ -void dictDefragTables(dict* d) { + * Receives a pointer to the dict* and return a new dict* when the dict + * struct itself was moved. + * + * Returns NULL in case the allocation wasn't moved. + * When it returns a non-null value, the old pointer was already released + * and should NOT be accessed. 
*/ +dict *dictDefragTables(dict *d) { + dict *ret = NULL; dictEntry **newtable; + /* handle the dict struct */ + if ((ret = activeDefragAlloc(d))) + d = ret; /* handle the first hash table */ + if (!d->ht_table[0]) return ret; /* created but unused */ newtable = activeDefragAlloc(d->ht_table[0]); if (newtable) d->ht_table[0] = newtable; @@ -157,6 +179,7 @@ void dictDefragTables(dict* d) { if (newtable) d->ht_table[1] = newtable; } + return ret; } /* Internal function used by zslDefrag */ @@ -243,6 +266,31 @@ void activeDefragSdsDictCallback(void *privdata, const dictEntry *de) { UNUSED(de); } +void activeDefragHfieldDictCallback(void *privdata, const dictEntry *de) { + dict *d = privdata; + hfield newhf, hf = dictGetKey(de); + + if (hfieldGetExpireTime(hf) == EB_EXPIRE_TIME_INVALID) { + /* If the hfield does not have TTL, we directly defrag it. */ + newhf = activeDefragHfield(hf); + } else { + /* Update its reference in the ebucket while defragging it. */ + ebuckets *eb = hashTypeGetDictMetaHFE(d); + newhf = ebDefragItem(eb, &hashFieldExpireBucketsType, hf, (ebDefragFunction *)activeDefragHfield); + } + if (newhf) { + /* We can't search in dict for that key after we've released + * the pointer it holds, since it won't be able to do the string + * compare, but we can find the entry using key hash and pointer. */ + dictUseStoredKeyApi(d, 1); + uint64_t hash = dictGetHash(d, newhf); + dictUseStoredKeyApi(d, 0); + dictEntry *de = dictFindEntryByPtrAndHash(d, hf, hash); + serverAssert(de); + dictSetKey(d, de, newhf); + } +} + /* Defrag a dict with sds key and optional value (either ptr, sds or robj string) */ void activeDefragSdsDict(dict* d, int val_type) { unsigned long cursor = 0; @@ -261,6 +309,20 @@ void activeDefragSdsDict(dict* d, int val_type) { } while (cursor != 0); } +/* Defrag a dict with hfield key and sds value. 
*/ +void activeDefragHfieldDict(dict *d) { + unsigned long cursor = 0; + dictDefragFunctions defragfns = { + .defragAlloc = activeDefragAlloc, + .defragKey = NULL, /* Will be defragmented in activeDefragHfieldDictCallback. */ + .defragVal = (dictDefragAllocFunction *)activeDefragSds + }; + do { + cursor = dictScanDefrag(d, cursor, activeDefragHfieldDictCallback, + &defragfns, d); + } while (cursor != 0); +} + /* Defrag a list of ptr, sds or robj string values */ void activeDefragList(list *l, int val_type) { listNode *ln, *newln; @@ -415,10 +477,10 @@ void scanLaterHash(robj *ob, unsigned long *cursor) { dict *d = ob->ptr; dictDefragFunctions defragfns = { .defragAlloc = activeDefragAlloc, - .defragKey = (dictDefragAllocFunction *)activeDefragSds, + .defragKey = NULL, /* Will be defragmented in activeDefragHfieldDictCallback. */ .defragVal = (dictDefragAllocFunction *)activeDefragSds }; - *cursor = dictScanDefrag(d, *cursor, scanCallbackCountScanned, &defragfns, NULL); + *cursor = dictScanDefrag(d, *cursor, activeDefragHfieldDictCallback, &defragfns, d); } void defragQuicklist(redisDb *db, dictEntry *kde) { @@ -457,11 +519,9 @@ void defragZsetSkiplist(redisDb *db, dictEntry *kde) { } dictReleaseIterator(di); } - /* handle the dict struct */ - if ((newdict = activeDefragAlloc(zs->dict))) + /* defrag the dict struct and tables */ + if ((newdict = dictDefragTables(zs->dict))) zs->dict = newdict; - /* defrag the dict tables */ - dictDefragTables(zs->dict); } void defragHash(redisDb *db, dictEntry *kde) { @@ -472,12 +532,10 @@ void defragHash(redisDb *db, dictEntry *kde) { if (dictSize(d) > server.active_defrag_max_scan_fields) defragLater(db, kde); else - activeDefragSdsDict(d, DEFRAG_SDS_DICT_VAL_IS_SDS); - /* handle the dict struct */ - if ((newd = activeDefragAlloc(ob->ptr))) + activeDefragHfieldDict(d); + /* defrag the dict struct and tables */ + if ((newd = dictDefragTables(ob->ptr))) ob->ptr = newd; - /* defrag the dict tables */ - dictDefragTables(ob->ptr); } 
void defragSet(redisDb *db, dictEntry *kde) { @@ -489,11 +547,9 @@ void defragSet(redisDb *db, dictEntry *kde) { defragLater(db, kde); else activeDefragSdsDict(d, DEFRAG_SDS_DICT_NO_VAL); - /* handle the dict struct */ - if ((newd = activeDefragAlloc(ob->ptr))) + /* defrag the dict struct and tables */ + if ((newd = dictDefragTables(ob->ptr))) ob->ptr = newd; - /* defrag the dict tables */ - dictDefragTables(ob->ptr); } /* Defrag callback for radix tree iterator, called for each node, @@ -668,32 +724,43 @@ void defragModule(redisDb *db, dictEntry *kde) { } /* for each key we scan in the main dict, this function will attempt to defrag - * all the various pointers it has. Returns a stat of how many pointers were - * moved. */ -void defragKey(redisDb *db, dictEntry *de) { + * all the various pointers it has. */ +void defragKey(defragCtx *ctx, dictEntry *de) { sds keysds = dictGetKey(de); - robj *newob, *ob; + robj *newob, *ob = dictGetVal(de); unsigned char *newzl; sds newsds; - + redisDb *db = ctx->privdata; + int slot = ctx->slot; /* Try to defrag the key name. */ newsds = activeDefragSds(keysds); if (newsds) { - dictSetKey(db->dict, de, newsds); - if (dictSize(db->expires)) { + kvstoreDictSetKey(db->keys, slot, de, newsds); + if (kvstoreDictSize(db->expires, slot)) { /* We can't search in db->expires for that key after we've released * the pointer it holds, since it won't be able to do the string * compare, but we can find the entry using key hash and pointer. */ - uint64_t hash = dictGetHash(db->dict, newsds); - dictEntry *expire_de = dictFindEntryByPtrAndHash(db->expires, keysds, hash); - if (expire_de) dictSetKey(db->expires, expire_de, newsds); + uint64_t hash = kvstoreGetHash(db->expires, newsds); + dictEntry *expire_de = kvstoreDictFindEntryByPtrAndHash(db->expires, slot, keysds, hash); + if (expire_de) kvstoreDictSetKey(db->expires, slot, expire_de, newsds); } + + /* Update the key's reference in the dict's metadata or the listpackEx. 
*/ + if (unlikely(ob->type == OBJ_HASH)) + hashTypeUpdateKeyRef(ob, newsds); } /* Try to defrag robj and / or string value. */ - ob = dictGetVal(de); - if ((newob = activeDefragStringOb(ob))) { - dictSetVal(db->dict, de, newob); + if (unlikely(ob->type == OBJ_HASH && hashTypeGetMinExpire(ob, 0) != EB_EXPIRE_TIME_INVALID)) { + /* Update its reference in the ebucket while defragging it. */ + newob = ebDefragItem(&db->hexpires, &hashExpireBucketsType, ob, + (ebDefragFunction *)activeDefragStringOb); + } else { + /* If the dict doesn't have metadata, we directly defrag it. */ + newob = activeDefragStringOb(ob); + } + if (newob) { + kvstoreDictSetVal(db->keys, slot, de, newob); ob = newob; } @@ -733,6 +800,12 @@ void defragKey(redisDb *db, dictEntry *de) { if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; + } else if (ob->encoding == OBJ_ENCODING_LISTPACK_EX) { + listpackEx *newlpt, *lpt = (listpackEx*)ob->ptr; + if ((newlpt = activeDefragAlloc(lpt))) + ob->ptr = lpt = newlpt; + if ((newzl = activeDefragAlloc(lpt->lp))) + lpt->lp = newzl; } else if (ob->encoding == OBJ_ENCODING_HT) { defragHash(db, de); } else { @@ -750,7 +823,7 @@ void defragKey(redisDb *db, dictEntry *de) { /* Defrag scan callback for the main db dictionary. */ void defragScanCallback(void *privdata, const dictEntry *de) { long long hits_before = server.stat_active_defrag_hits; - defragKey((redisDb*)privdata, (dictEntry*)de); + defragKey((defragCtx*)privdata, (dictEntry*)de); if (server.stat_active_defrag_hits != hits_before) server.stat_active_defrag_key_hits++; else @@ -765,20 +838,68 @@ void defragScanCallback(void *privdata, const dictEntry *de) { * or not, a false detection can cause the defragmenter to waste a lot of CPU * without the possibility of getting any results. 
*/ float getAllocatorFragmentation(size_t *out_frag_bytes) { - size_t resident, active, allocated; - zmalloc_get_allocator_info(&allocated, &active, &resident); - float frag_pct = ((float)active / allocated)*100 - 100; - size_t frag_bytes = active - allocated; + size_t resident, active, allocated, frag_smallbins_bytes; + zmalloc_get_allocator_info(1, &allocated, &active, &resident, NULL, NULL, &frag_smallbins_bytes); + + if (server.lua_arena != UINT_MAX) { + size_t lua_resident, lua_active, lua_allocated, lua_frag_smallbins_bytes; + zmalloc_get_allocator_info_by_arena(server.lua_arena, 0, &lua_allocated, &lua_active, &lua_resident, &lua_frag_smallbins_bytes); + resident -= lua_resident; + active -= lua_active; + allocated -= lua_allocated; + frag_smallbins_bytes -= lua_frag_smallbins_bytes; + } + + /* Calculate the fragmentation ratio as the proportion of wasted memory in small + * bins (which are defraggable) relative to the total allocated memory (including large bins). + * This is because otherwise, if most of the memory usage is large bins, we may show high percentage, + * despite the fact it's not a lot of memory for the user. */ + float frag_pct = (float)frag_smallbins_bytes / allocated * 100; float rss_pct = ((float)resident / allocated)*100 - 100; size_t rss_bytes = resident - allocated; if(out_frag_bytes) - *out_frag_bytes = frag_bytes; + *out_frag_bytes = frag_smallbins_bytes; serverLog(LL_DEBUG, - "allocated=%zu, active=%zu, resident=%zu, frag=%.0f%% (%.0f%% rss), frag_bytes=%zu (%zu rss)", - allocated, active, resident, frag_pct, rss_pct, frag_bytes, rss_bytes); + "allocated=%zu, active=%zu, resident=%zu, frag=%.2f%% (%.2f%% rss), frag_bytes=%zu (%zu rss)", + allocated, active, resident, frag_pct, rss_pct, frag_smallbins_bytes, rss_bytes); return frag_pct; } +/* Defrag scan callback for the pubsub dictionary. 
*/ +void defragPubsubScanCallback(void *privdata, const dictEntry *de) { + defragCtx *ctx = privdata; + defragPubSubCtx *pubsub_ctx = ctx->privdata; + kvstore *pubsub_channels = pubsub_ctx->pubsub_channels; + robj *newchannel, *channel = dictGetKey(de); + dict *newclients, *clients = dictGetVal(de); + + /* Try to defrag the channel name. */ + serverAssert(channel->refcount == (int)dictSize(clients) + 1); + newchannel = activeDefragStringObEx(channel, dictSize(clients) + 1); + if (newchannel) { + kvstoreDictSetKey(pubsub_channels, ctx->slot, (dictEntry*)de, newchannel); + + /* The channel name is shared by the client's pubsub(shard) and server's + * pubsub(shard), after defragging the channel name, we need to update + * the reference in the clients' dictionary. */ + dictIterator *di = dictGetIterator(clients); + dictEntry *clientde; + while((clientde = dictNext(di)) != NULL) { + client *c = dictGetKey(clientde); + dictEntry *pubsub_channel = dictFind(pubsub_ctx->clientPubSubChannels(c), newchannel); + serverAssert(pubsub_channel); + dictSetKey(pubsub_ctx->clientPubSubChannels(c), pubsub_channel, newchannel); + } + dictReleaseIterator(di); + } + + /* Try to defrag the dictionary of clients that is stored as the value part. */ + if ((newclients = dictDefragTables(clients))) + kvstoreDictSetVal(pubsub_channels, ctx->slot, (dictEntry*)de, newclients); + + server.stat_active_defrag_scanned++; +} + /* We may need to defrag other globals, one small allocation can hold a full allocator run. 
* so although small, it is still important to defrag these */ void defragOtherGlobals(void) { @@ -788,6 +909,8 @@ void defragOtherGlobals(void) { * that remain static for a long time */ activeDefragSdsDict(evalScriptsDict(), DEFRAG_SDS_DICT_VAL_LUA_SCRIPT); moduleDefragGlobals(); + kvstoreDictLUTDefrag(server.pubsub_channels, dictDefragTables); + kvstoreDictLUTDefrag(server.pubsubshard_channels, dictDefragTables); } /* returns 0 more work may or may not be needed (see non-zero cursor), @@ -821,7 +944,7 @@ static sds defrag_later_current_key = NULL; static unsigned long defrag_later_cursor = 0; /* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. */ -int defragLaterStep(redisDb *db, long long endtime) { +int defragLaterStep(redisDb *db, int slot, long long endtime) { unsigned int iterations = 0; unsigned long long prev_defragged = server.stat_active_defrag_hits; unsigned long long prev_scanned = server.stat_active_defrag_scanned; @@ -851,7 +974,7 @@ int defragLaterStep(redisDb *db, long long endtime) { } /* each time we enter this function we need to fetch the key from the dict again (if it still exists) */ - dictEntry *de = dictFind(db->dict, defrag_later_current_key); + dictEntry *de = kvstoreDictFind(db->keys, slot, defrag_later_current_key); key_defragged = server.stat_active_defrag_hits; do { int quit = 0; @@ -896,7 +1019,8 @@ void computeDefragCycles(void) { return; } - /* Calculate the adaptive aggressiveness of the defrag */ + /* Calculate the adaptive aggressiveness of the defrag based on the current + * fragmentation and configurations. */ int cpu_pct = INTERPOLATE(frag_pct, server.active_defrag_threshold_lower, server.active_defrag_threshold_upper, @@ -905,10 +1029,15 @@ void computeDefragCycles(void) { cpu_pct = LIMIT(cpu_pct, server.active_defrag_cycle_min, server.active_defrag_cycle_max); - /* We allow increasing the aggressiveness during a scan, but don't - * reduce it. 
*/ - if (cpu_pct > server.active_defrag_running) { + + /* Normally we allow increasing the aggressiveness during a scan, but don't + * reduce it, since we should not lower the aggressiveness when fragmentation + * drops. But when a configuration is made, we should reconsider it. */ + if (cpu_pct > server.active_defrag_running || + server.active_defrag_configuration_changed) + { server.active_defrag_running = cpu_pct; + server.active_defrag_configuration_changed = 0; serverLog(LL_VERBOSE, "Starting active defrag, frag=%.0f%%, frag_bytes=%zu, cpu=%d%%", frag_pct, frag_bytes, cpu_pct); @@ -919,9 +1048,11 @@ void computeDefragCycles(void) { * This works in a similar way to activeExpireCycle, in the sense that * we do incremental work across calls. */ void activeDefragCycle(void) { + static int slot = -1; static int current_db = -1; - static unsigned long cursor = 0; - static unsigned long expires_cursor = 0; + static int defrag_later_item_in_progress = 0; + static int defrag_stage = 0; + static unsigned long defrag_cursor = 0; static redisDb *db = NULL; static long long start_scan, start_stat; unsigned int iterations = 0; @@ -929,18 +1060,23 @@ void activeDefragCycle(void) { unsigned long long prev_scanned = server.stat_active_defrag_scanned; long long start, timelimit, endtime; mstime_t latency; + int all_stages_finished = 0; int quit = 0; if (!server.active_defrag_enabled) { if (server.active_defrag_running) { /* if active defrag was disabled mid-run, start from fresh next time. 
*/ server.active_defrag_running = 0; + server.active_defrag_configuration_changed = 0; if (db) listEmpty(db->defrag_later); defrag_later_current_key = NULL; defrag_later_cursor = 0; current_db = -1; - cursor = 0; + defrag_stage = 0; + defrag_cursor = 0; + slot = -1; + defrag_later_item_in_progress = 0; db = NULL; goto update_metrics; } @@ -955,6 +1091,14 @@ void activeDefragCycle(void) { run_with_period(1000) { computeDefragCycles(); } + + /* Normally it is checked once a second, but when there is a configuration + * change, we want to check it as soon as possible. */ + if (server.active_defrag_configuration_changed) { + computeDefragCycles(); + server.active_defrag_configuration_changed = 0; + } + if (!server.active_defrag_running) return; @@ -968,9 +1112,9 @@ void activeDefragCycle(void) { dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc}; do { /* if we're not continuing a scan from the last call or loop, start a new one */ - if (!cursor && !expires_cursor) { + if (!defrag_stage && !defrag_cursor && (slot < 0)) { /* finish any leftovers from previous db before moving to the next one */ - if (db && defragLaterStep(db, endtime)) { + if (db && defragLaterStep(db, slot, endtime)) { quit = 1; /* time is up, we didn't finish all the work */ break; /* this will exit the function and we'll continue on the next cycle */ } @@ -989,7 +1133,10 @@ void activeDefragCycle(void) { start_scan = now; current_db = -1; - cursor = 0; + defrag_stage = 0; + defrag_cursor = 0; + slot = -1; + defrag_later_item_in_progress = 0; db = NULL; server.active_defrag_running = 0; @@ -1005,38 +1152,79 @@ void activeDefragCycle(void) { } db = &server.db[current_db]; - cursor = 0; + kvstoreDictLUTDefrag(db->keys, dictDefragTables); + kvstoreDictLUTDefrag(db->expires, dictDefragTables); + defrag_stage = 0; + defrag_cursor = 0; + slot = -1; + defrag_later_item_in_progress = 0; } + /* This array of structures holds the parameters for all defragmentation stages. 
*/ + typedef struct defragStage { + kvstore *kvs; + dictScanFunction *scanfn; + void *privdata; + } defragStage; + defragStage defrag_stages[] = { + {db->keys, defragScanCallback, db}, + {db->expires, scanCallbackCountScanned, NULL}, + {server.pubsub_channels, defragPubsubScanCallback, + &(defragPubSubCtx){server.pubsub_channels, getClientPubSubChannels}}, + {server.pubsubshard_channels, defragPubsubScanCallback, + &(defragPubSubCtx){server.pubsubshard_channels, getClientPubSubShardChannels}}, + }; do { + int num_stages = sizeof(defrag_stages) / sizeof(defrag_stages[0]); + serverAssert(defrag_stage < num_stages); + defragStage *current_stage = &defrag_stages[defrag_stage]; + /* before scanning the next bucket, see if we have big keys left from the previous bucket to scan */ - if (defragLaterStep(db, endtime)) { + if (defragLaterStep(db, slot, endtime)) { quit = 1; /* time is up, we didn't finish all the work */ break; /* this will exit the function and we'll continue on the next cycle */ } - /* Scan the keyspace dict unless we're scanning the expire dict. */ - if (!expires_cursor) - cursor = dictScanDefrag(db->dict, cursor, defragScanCallback, - &defragfns, db); + if (!defrag_later_item_in_progress) { + /* Continue defragmentation from the previous stage. + * If slot is -1, it means this stage starts from the first non-empty slot. */ + if (slot == -1) slot = kvstoreGetFirstNonEmptyDictIndex(current_stage->kvs); + defrag_cursor = kvstoreDictScanDefrag(current_stage->kvs, slot, defrag_cursor, + current_stage->scanfn, &defragfns, &(defragCtx){current_stage->privdata, slot}); + } + + if (!defrag_cursor) { + /* Move to the next slot only if regular and large item scanning has been completed. */ + if (listLength(db->defrag_later) > 0) { + defrag_later_item_in_progress = 1; + continue; + } - /* When done scanning the keyspace dict, we scan the expire dict. 
*/ - if (!cursor) - expires_cursor = dictScanDefrag(db->expires, expires_cursor, - scanCallbackCountScanned, - &defragfns, NULL); + /* Move to the next slot in the current stage. If we've reached the end, move to the next stage. */ + if ((slot = kvstoreGetNextNonEmptyDictIndex(current_stage->kvs, slot)) == -1) + defrag_stage++; + defrag_later_item_in_progress = 0; + } + /* Check if all defragmentation stages have been processed. + * If so, mark as finished and reset the stage counter to move on to next database. */ + if (defrag_stage == num_stages) { + all_stages_finished = 1; + defrag_stage = 0; + } + /* Once in 16 scan iterations, 512 pointer reallocations. or 64 keys * (if we have a lot of pointers in one hash bucket or rehashing), * check if we reached the time limit. * But regardless, don't start a new db in this loop, this is because after * the last db we call defragOtherGlobals, which must be done in one cycle */ - if (!(cursor || expires_cursor) || + if (all_stages_finished || ++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || server.stat_active_defrag_scanned - prev_scanned > 64) { - if (!cursor || ustime() > endtime) { + /* Quit if all stages were finished or timeout. */ + if (all_stages_finished || ustime() > endtime) { quit = 1; break; } @@ -1044,7 +1232,7 @@ void activeDefragCycle(void) { prev_defragged = server.stat_active_defrag_hits; prev_scanned = server.stat_active_defrag_scanned; } - } while((cursor || expires_cursor) && !quit); + } while(!all_stages_finished && !quit); } while(!quit); latencyEndMonitor(latency); diff --git a/src/dict.c b/src/dict.c index 6760da1540d..2928d8af5bd 100644 --- a/src/dict.c +++ b/src/dict.c @@ -5,32 +5,11 @@ * tables of power of two in size are used, collisions are handled by * chaining. See the source code for more information... :) * - * Copyright (c) 2006-2012, Salvatore Sanfilippo + * Copyright (c) 2006-Present, Redis Ltd. * All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "fmacros.h" @@ -46,20 +25,23 @@ #include "dict.h" #include "zmalloc.h" #include "redisassert.h" +#include "monotonic.h" -/* Using dictEnableResize() / dictDisableResize() we make possible to disable +/* Using dictSetResizeEnabled() we make possible to disable * resizing and rehashing of the hash table as needed. 
This is very important * for Redis, as we use copy-on-write and don't want to move too much memory * around when there is a child performing saving operations. * * Note that even when dict_can_resize is set to DICT_RESIZE_AVOID, not all - * resizes are prevented: a hash table is still allowed to grow if the ratio - * between the number of elements and the buckets > dict_force_resize_ratio. */ + * resizes are prevented: + * - A hash table is still allowed to expand if the ratio between the number + * of elements and the buckets >= dict_force_resize_ratio. + * - A hash table is still allowed to shrink if the ratio between the number + * of elements and the buckets <= 1 / (HASHTABLE_MIN_FILL * dict_force_resize_ratio). */ static dictResizeEnable dict_can_resize = DICT_RESIZE_ENABLE; -static unsigned int dict_force_resize_ratio = 5; +static unsigned int dict_force_resize_ratio = 4; /* -------------------------- types ----------------------------------------- */ - struct dictEntry { void *key; union { @@ -69,9 +51,6 @@ struct dictEntry { double d; } v; struct dictEntry *next; /* Next entry in the same hash bucket. */ - void *metadata[]; /* An arbitrary number of bytes (starting at a - * pointer-aligned address) of size as returned - * by dictType's dictEntryMetadataBytes(). 
*/ }; typedef struct { @@ -81,12 +60,32 @@ typedef struct { /* -------------------------- private prototypes ---------------------------- */ -static int _dictExpandIfNeeded(dict *d); +static void _dictExpandIfNeeded(dict *d); +static void _dictShrinkIfNeeded(dict *d); static signed char _dictNextExp(unsigned long size); static int _dictInit(dict *d, dictType *type); static dictEntry *dictGetNext(const dictEntry *de); static dictEntry **dictGetNextRef(dictEntry *de); static void dictSetNext(dictEntry *de, dictEntry *next); +static int dictDefaultCompare(dict *d, const void *key1, const void *key2); + +/* -------------------------- misc inline functions -------------------------------- */ + +typedef int (*keyCmpFunc)(dict *d, const void *key1, const void *key2); +static inline keyCmpFunc dictGetKeyCmpFunc(dict *d) { + if (d->useStoredKeyApi && d->type->storedKeyCompare) + return d->type->storedKeyCompare; + if (d->type->keyCompare) + return d->type->keyCompare; + return dictDefaultCompare; +} + +static inline uint64_t dictHashKey(dict *d, const void *key, int isStoredKey) { + if (isStoredKey && d->type->storedHashFunction) + return d->type->storedHashFunction(key); + else + return d->type->hashFunction(key); +} /* -------------------------- hash functions -------------------------------- */ @@ -184,16 +183,28 @@ static void _dictReset(dict *d, int htidx) /* Create a new hash table */ dict *dictCreate(dictType *type) { - size_t metasize = type->dictMetadataBytes ? type->dictMetadataBytes() : 0; - dict *d = zmalloc(sizeof(*d) + metasize); - if (metasize) { + size_t metasize = type->dictMetadataBytes ? 
type->dictMetadataBytes(NULL) : 0; + dict *d = zmalloc(sizeof(*d)+metasize); + if (metasize > 0) { memset(dictMetadata(d), 0, metasize); } - _dictInit(d,type); return d; } +/* Change dictType of dict to another one with metadata support + * Rest of dictType's values must stay the same */ +void dictTypeAddMeta(dict **d, dictType *typeWithMeta) { + /* Verify new dictType is compatible with the old one */ + dictType toCmp = *typeWithMeta; + toCmp.dictMetadataBytes = NULL; /* Expected old one not to have metadata */ + toCmp.onDictRelease = (*d)->type->onDictRelease; /* Ignore 'onDictRelease' in comparison */ + assert(memcmp((*d)->type, &toCmp, sizeof(dictType)) == 0); /* The rest of the dictType fields must be the same */ + + *d = zrealloc(*d, sizeof(dict) + typeWithMeta->dictMetadataBytes(*d)); + (*d)->type = typeWithMeta; +} + /* Initialize the hash table */ int _dictInit(dict *d, dictType *type) { @@ -202,33 +213,20 @@ int _dictInit(dict *d, dictType *type) d->type = type; d->rehashidx = -1; d->pauserehash = 0; + d->pauseAutoResize = 0; + d->useStoredKeyApi = 0; return DICT_OK; } -/* Resize the table to the minimal size that contains all the elements, - * but with the invariant of a USED/BUCKETS ratio near to <= 1 */ -int dictResize(dict *d) -{ - unsigned long minimal; - - if (dict_can_resize != DICT_RESIZE_ENABLE || dictIsRehashing(d)) return DICT_ERR; - minimal = d->ht_used[0]; - if (minimal < DICT_HT_INITIAL_SIZE) - minimal = DICT_HT_INITIAL_SIZE; - return dictExpand(d, minimal); -} - -/* Expand or create the hash table, +/* Resize or create the hash table, * when malloc_failed is non-NULL, it'll avoid panic if malloc fails (in which case it'll be set to 1). - * Returns DICT_OK if expand was performed, and DICT_ERR if skipped. */ -int _dictExpand(dict *d, unsigned long size, int* malloc_failed) + * Returns DICT_OK if resize was performed, and DICT_ERR if skipped. 
*/ +int _dictResize(dict *d, unsigned long size, int* malloc_failed) { if (malloc_failed) *malloc_failed = 0; - /* the size is invalid if it is smaller than the number of - * elements already inside the hash table */ - if (dictIsRehashing(d) || d->ht_used[0] > size) - return DICT_ERR; + /* We can't rehash twice if rehashing is ongoing. */ + assert(!dictIsRehashing(d)); /* the new hash table */ dictEntry **new_ht_table; @@ -236,7 +234,7 @@ int _dictExpand(dict *d, unsigned long size, int* malloc_failed) signed char new_ht_size_exp = _dictNextExp(size); /* Detect overflows */ - size_t newsize = 1ul<ht_table[0] == NULL) { + /* Prepare a second hash table for incremental rehashing. + * We do this even for the first initialization, so that we can trigger the + * rehashingStarted more conveniently, we will clean it up right after. */ + d->ht_size_exp[1] = new_ht_size_exp; + d->ht_used[1] = new_ht_used; + d->ht_table[1] = new_ht_table; + d->rehashidx = 0; + if (d->type->rehashingStarted) d->type->rehashingStarted(d); + + /* Is this the first initialization or is the first hash table empty? If so + * it's not really a rehashing, we can just set the first hash table so that + * it can accept keys. 
*/ + if (d->ht_table[0] == NULL || d->ht_used[0] == 0) { + if (d->type->rehashingCompleted) d->type->rehashingCompleted(d); + if (d->ht_table[0]) zfree(d->ht_table[0]); d->ht_size_exp[0] = new_ht_size_exp; d->ht_used[0] = new_ht_used; d->ht_table[0] = new_ht_table; + _dictReset(d, 1); + d->rehashidx = -1; return DICT_OK; } - /* Prepare a second hash table for incremental rehashing */ - d->ht_size_exp[1] = new_ht_size_exp; - d->ht_used[1] = new_ht_used; - d->ht_table[1] = new_ht_table; - d->rehashidx = 0; return DICT_OK; } +int _dictExpand(dict *d, unsigned long size, int* malloc_failed) { + /* the size is invalid if it is smaller than the size of the hash table + * or smaller than the number of elements already inside the hash table */ + if (dictIsRehashing(d) || d->ht_used[0] > size || DICTHT_SIZE(d->ht_size_exp[0]) >= size) + return DICT_ERR; + return _dictResize(d, size, malloc_failed); +} + /* return DICT_ERR if expand was not performed */ int dictExpand(dict *d, unsigned long size) { return _dictExpand(d, size, NULL); @@ -278,11 +293,86 @@ int dictExpand(dict *d, unsigned long size) { /* return DICT_ERR if expand failed due to memory allocation failure */ int dictTryExpand(dict *d, unsigned long size) { - int malloc_failed; + int malloc_failed = 0; _dictExpand(d, size, &malloc_failed); return malloc_failed? DICT_ERR : DICT_OK; } +/* return DICT_ERR if shrink was not performed */ +int dictShrink(dict *d, unsigned long size) { + /* the size is invalid if it is bigger than the size of the hash table + * or smaller than the number of elements already inside the hash table */ + if (dictIsRehashing(d) || d->ht_used[0] > size || DICTHT_SIZE(d->ht_size_exp[0]) <= size) + return DICT_ERR; + return _dictResize(d, size, NULL); +} + +/* Helper function for `dictRehash` and `dictBucketRehash` which rehashes all the keys + * in a bucket at index `idx` from the old to the new hash HT. 
*/ +static void rehashEntriesInBucketAtIndex(dict *d, uint64_t idx) { + dictEntry *de = d->ht_table[0][idx]; + uint64_t h; + dictEntry *nextde; + while (de) { + nextde = dictGetNext(de); + void *key = dictGetKey(de); + /* Get the index in the new hash table */ + if (d->ht_size_exp[1] > d->ht_size_exp[0]) { + h = dictHashKey(d, key, 1) & DICTHT_SIZE_MASK(d->ht_size_exp[1]); + } else { + /* We're shrinking the table. The tables sizes are powers of + * two, so we simply mask the bucket index in the larger table + * to get the bucket index in the smaller table. */ + h = idx & DICTHT_SIZE_MASK(d->ht_size_exp[1]); + } + if (d->type->no_value) { + if (d->type->keys_are_odd && !d->ht_table[1][h]) { + /* Destination bucket is empty and we can store the key + * directly without an allocated entry. Free the old entry + * if it's an allocated entry. + * + * TODO: Add a flag 'keys_are_even' and if set, we can use + * this optimization for these dicts too. We can set the LSB + * bit when stored as a dict entry and clear it again when + * we need the key back. */ + assert(entryIsKey(key)); + if (!entryIsKey(de)) zfree(decodeMaskedPtr(de)); + de = key; + } else if (entryIsKey(de)) { + /* We don't have an allocated entry but we need one. */ + de = createEntryNoValue(key, d->ht_table[1][h]); + } else { + /* Just move the existing entry to the destination table and + * update the 'next' field. 
*/ + assert(entryIsNoValue(de)); + dictSetNext(de, d->ht_table[1][h]); + } + } else { + dictSetNext(de, d->ht_table[1][h]); + } + d->ht_table[1][h] = de; + d->ht_used[0]--; + d->ht_used[1]++; + de = nextde; + } + d->ht_table[0][idx] = NULL; +} + +/* This checks if we already rehashed the whole table and if more rehashing is required */ +static int dictCheckRehashingCompleted(dict *d) { + if (d->ht_used[0] != 0) return 0; + + if (d->type->rehashingCompleted) d->type->rehashingCompleted(d); + zfree(d->ht_table[0]); + /* Copy the new ht onto the old one */ + d->ht_table[0] = d->ht_table[1]; + d->ht_used[0] = d->ht_used[1]; + d->ht_size_exp[0] = d->ht_size_exp[1]; + _dictReset(d, 1); + d->rehashidx = -1; + return 1; +} + /* Performs N steps of incremental rehashing. Returns 1 if there are still * keys to move from the old to the new hash table, otherwise 0 is returned. * @@ -297,16 +387,17 @@ int dictRehash(dict *d, int n) { unsigned long s0 = DICTHT_SIZE(d->ht_size_exp[0]); unsigned long s1 = DICTHT_SIZE(d->ht_size_exp[1]); if (dict_can_resize == DICT_RESIZE_FORBID || !dictIsRehashing(d)) return 0; + /* If dict_can_resize is DICT_RESIZE_AVOID, we want to avoid rehashing. + * - If expanding, the threshold is dict_force_resize_ratio which is 4. + * - If shrinking, the threshold is 1 / (HASHTABLE_MIN_FILL * dict_force_resize_ratio) which is 1/32. 
*/ if (dict_can_resize == DICT_RESIZE_AVOID && - ((s1 > s0 && s1 / s0 < dict_force_resize_ratio) || - (s1 < s0 && s0 / s1 < dict_force_resize_ratio))) + ((s1 > s0 && s1 < dict_force_resize_ratio * s0) || + (s1 < s0 && s0 < HASHTABLE_MIN_FILL * dict_force_resize_ratio * s1))) { return 0; } while(n-- && d->ht_used[0] != 0) { - dictEntry *de, *nextde; - /* Note that rehashidx can't overflow as we are sure there are more * elements because ht[0].used != 0 */ assert(DICTHT_SIZE(d->ht_size_exp[0]) > (unsigned long)d->rehashidx); @@ -314,70 +405,12 @@ int dictRehash(dict *d, int n) { d->rehashidx++; if (--empty_visits == 0) return 1; } - de = d->ht_table[0][d->rehashidx]; /* Move all the keys in this bucket from the old to the new hash HT */ - while(de) { - uint64_t h; - - nextde = dictGetNext(de); - void *key = dictGetKey(de); - /* Get the index in the new hash table */ - if (d->ht_size_exp[1] > d->ht_size_exp[0]) { - h = dictHashKey(d, key) & DICTHT_SIZE_MASK(d->ht_size_exp[1]); - } else { - /* We're shrinking the table. The tables sizes are powers of - * two, so we simply mask the bucket index in the larger table - * to get the bucket index in the smaller table. */ - h = d->rehashidx & DICTHT_SIZE_MASK(d->ht_size_exp[1]); - } - if (d->type->no_value) { - if (d->type->keys_are_odd && !d->ht_table[1][h]) { - /* Destination bucket is empty and we can store the key - * directly without an allocated entry. Free the old entry - * if it's an allocated entry. - * - * TODO: Add a flag 'keys_are_even' and if set, we can use - * this optimization for these dicts too. We can set the LSB - * bit when stored as a dict entry and clear it again when - * we need the key back. */ - assert(entryIsKey(key)); - if (!entryIsKey(de)) zfree(decodeMaskedPtr(de)); - de = key; - } else if (entryIsKey(de)) { - /* We don't have an allocated entry but we need one. 
*/ - de = createEntryNoValue(key, d->ht_table[1][h]); - } else { - /* Just move the existing entry to the destination table and - * update the 'next' field. */ - assert(entryIsNoValue(de)); - dictSetNext(de, d->ht_table[1][h]); - } - } else { - dictSetNext(de, d->ht_table[1][h]); - } - d->ht_table[1][h] = de; - d->ht_used[0]--; - d->ht_used[1]++; - de = nextde; - } - d->ht_table[0][d->rehashidx] = NULL; + rehashEntriesInBucketAtIndex(d, d->rehashidx); d->rehashidx++; } - /* Check if we already rehashed the whole table... */ - if (d->ht_used[0] == 0) { - zfree(d->ht_table[0]); - /* Copy the new ht onto the old one */ - d->ht_table[0] = d->ht_table[1]; - d->ht_used[0] = d->ht_used[1]; - d->ht_size_exp[0] = d->ht_size_exp[1]; - _dictReset(d, 1); - d->rehashidx = -1; - return 0; - } - - /* More to rehash... */ - return 1; + return !dictCheckRehashingCompleted(d); } long long timeInMilliseconds(void) { @@ -387,18 +420,19 @@ long long timeInMilliseconds(void) { return (((long long)tv.tv_sec)*1000)+(tv.tv_usec/1000); } -/* Rehash in ms+"delta" milliseconds. The value of "delta" is larger - * than 0, and is smaller than 1 in most cases. The exact upper bound +/* Rehash in us+"delta" microseconds. The value of "delta" is larger + * than 0, and is smaller than 1000 in most cases. The exact upper bound * depends on the running time of dictRehash(d,100).*/ -int dictRehashMilliseconds(dict *d, int ms) { +int dictRehashMicroseconds(dict *d, uint64_t us) { if (d->pauserehash > 0) return 0; - long long start = timeInMilliseconds(); + monotime timer; + elapsedStart(&timer); int rehashes = 0; while(dictRehash(d,100)) { rehashes += 100; - if (timeInMilliseconds()-start > ms) break; + if (elapsedUs(timer) >= us) break; } return rehashes; } @@ -415,9 +449,24 @@ static void _dictRehashStep(dict *d) { if (d->pauserehash == 0) dictRehash(d,1); } -/* Return a pointer to the metadata section within the dict. 
*/ -void *dictMetadata(dict *d) { - return &d->metadata; +/* Performs rehashing on a single bucket. */ +int _dictBucketRehash(dict *d, uint64_t idx) { + if (d->pauserehash != 0) return 0; + unsigned long s0 = DICTHT_SIZE(d->ht_size_exp[0]); + unsigned long s1 = DICTHT_SIZE(d->ht_size_exp[1]); + if (dict_can_resize == DICT_RESIZE_FORBID || !dictIsRehashing(d)) return 0; + /* If dict_can_resize is DICT_RESIZE_AVOID, we want to avoid rehashing. + * - If expanding, the threshold is dict_force_resize_ratio which is 4. + * - If shrinking, the threshold is 1 / (HASHTABLE_MIN_FILL * dict_force_resize_ratio) which is 1/32. */ + if (dict_can_resize == DICT_RESIZE_AVOID && + ((s1 > s0 && s1 < dict_force_resize_ratio * s0) || + (s1 < s0 && s0 < HASHTABLE_MIN_FILL * dict_force_resize_ratio * s1))) + { + return 0; + } + rehashEntriesInBucketAtIndex(d, idx); + dictCheckRehashingCompleted(d); + return 1; } /* Add an element to the target hash table */ @@ -472,9 +521,7 @@ dictEntry *dictInsertAtPosition(dict *d, void *key, void *position) { int htidx = dictIsRehashing(d) ? 1 : 0; assert(bucket >= &d->ht_table[htidx][0] && bucket <= &d->ht_table[htidx][DICTHT_SIZE_MASK(d->ht_size_exp[htidx])]); - size_t metasize = dictEntryMetadataSize(d); if (d->type->no_value) { - assert(!metasize); /* Entry metadata + no value not supported. */ if (d->type->keys_are_odd && !*bucket) { /* We can store the key directly in the destination bucket without the * allocated entry. @@ -494,11 +541,8 @@ dictEntry *dictInsertAtPosition(dict *d, void *key, void *position) { * Insert the element in top, with the assumption that in a database * system it is more likely that recently added entries are accessed * more frequently. 
*/ - entry = zmalloc(sizeof(*entry) + metasize); + entry = zmalloc(sizeof(*entry)); assert(entryIsNormal(entry)); /* Check alignment of allocation */ - if (metasize > 0) { - memset(dictEntryMetadata(entry), 0, metasize); - } entry->key = key; entry->next = *bucket; } @@ -561,16 +605,31 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { /* dict is empty */ if (dictSize(d) == 0) return NULL; - if (dictIsRehashing(d)) _dictRehashStep(d); - h = dictHashKey(d, key); + h = dictHashKey(d, key, d->useStoredKeyApi); + idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[0]); + + if (dictIsRehashing(d)) { + if ((long)idx >= d->rehashidx && d->ht_table[0][idx]) { + /* If we have a valid hash entry at `idx` in ht0, we perform + * rehash on the bucket at `idx` (being more CPU cache friendly) */ + _dictBucketRehash(d, idx); + } else { + /* If the hash entry is not in ht0, we rehash the buckets based + * on the rehashidx (not CPU cache friendly). */ + _dictRehashStep(d); + } + } + + keyCmpFunc cmpFunc = dictGetKeyCmpFunc(d); for (table = 0; table <= 1; table++) { + if (table == 0 && (long)idx < d->rehashidx) continue; idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[table]); he = d->ht_table[table][idx]; prevHe = NULL; while(he) { void *he_key = dictGetKey(he); - if (key == he_key || dictCompareKeys(d, key, he_key)) { + if (key == he_key || cmpFunc(d, key, he_key)) { /* Unlink the element from the list */ if (prevHe) dictSetNext(prevHe, dictGetNext(he)); @@ -580,6 +639,7 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { dictFreeUnlinkedEntry(d, he); } d->ht_used[table]--; + _dictShrinkIfNeeded(d); return he; } prevHe = he; @@ -660,6 +720,14 @@ int _dictClear(dict *d, int htidx, void(callback)(dict*)) { /* Clear & Release the hash table */ void dictRelease(dict *d) { + /* Someone may be monitoring a dict that started rehashing, before + * destroying the dict fake completion. 
*/ + if (dictIsRehashing(d) && d->type->rehashingCompleted) + d->type->rehashingCompleted(d); + + if (d->type->onDictRelease) + d->type->onDictRelease(d); + _dictClear(d,0,NULL); _dictClear(d,1,NULL); zfree(d); @@ -671,14 +739,30 @@ dictEntry *dictFind(dict *d, const void *key) uint64_t h, idx, table; if (dictSize(d) == 0) return NULL; /* dict is empty */ - if (dictIsRehashing(d)) _dictRehashStep(d); - h = dictHashKey(d, key); + + h = dictHashKey(d, key, d->useStoredKeyApi); + idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[0]); + keyCmpFunc cmpFunc = dictGetKeyCmpFunc(d); + + if (dictIsRehashing(d)) { + if ((long)idx >= d->rehashidx && d->ht_table[0][idx]) { + /* If we have a valid hash entry at `idx` in ht0, we perform + * rehash on the bucket at `idx` (being more CPU cache friendly) */ + _dictBucketRehash(d, idx); + } else { + /* If the hash entry is not in ht0, we rehash the buckets based + * on the rehashidx (not CPU cache friendly). */ + _dictRehashStep(d); + } + } + for (table = 0; table <= 1; table++) { + if (table == 0 && (long)idx < d->rehashidx) continue; idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[table]); he = d->ht_table[table][idx]; while(he) { void *he_key = dictGetKey(he); - if (key == he_key || dictCompareKeys(d, key, he_key)) + if (key == he_key || cmpFunc(d, key, he_key)) return he; he = dictGetNext(he); } @@ -715,14 +799,17 @@ dictEntry *dictTwoPhaseUnlinkFind(dict *d, const void *key, dictEntry ***plink, if (dictSize(d) == 0) return NULL; /* dict is empty */ if (dictIsRehashing(d)) _dictRehashStep(d); - h = dictHashKey(d, key); + + h = dictHashKey(d, key, d->useStoredKeyApi); + keyCmpFunc cmpFunc = dictGetKeyCmpFunc(d); for (table = 0; table <= 1; table++) { idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[table]); + if (table == 0 && (long)idx < d->rehashidx) continue; dictEntry **ref = &d->ht_table[table][idx]; while (ref && *ref) { void *de_key = dictGetKey(*ref); - if (key == de_key || dictCompareKeys(d, key, de_key)) { + if (key == de_key || cmpFunc(d, 
key, de_key)) { *table_index = table; *plink = ref; dictPauseRehashing(d); @@ -742,6 +829,7 @@ void dictTwoPhaseUnlinkFree(dict *d, dictEntry *he, dictEntry **plink, int table dictFreeKey(d, he); dictFreeVal(d, he); if (!entryIsKey(he)) zfree(decodeMaskedPtr(he)); + _dictShrinkIfNeeded(d); dictResumeRehashing(d); } @@ -788,12 +876,6 @@ double dictIncrDoubleVal(dictEntry *de, double val) { return de->v.d += val; } -/* A pointer to the metadata section within the dict entry. */ -void *dictEntryMetadata(dictEntry *de) { - assert(entryHasValue(de)); - return &de->metadata; -} - void *dictGetKey(const dictEntry *de) { if (entryIsKey(de)) return (void*)de; if (entryIsNoValue(de)) return decodeEntryNoValue(de)->key; @@ -856,7 +938,7 @@ static void dictSetNext(dictEntry *de, dictEntry *next) { * and values. */ size_t dictMemUsage(const dict *d) { return dictSize(d) * sizeof(dictEntry) + - dictSlots(d) * sizeof(dictEntry*); + dictBuckets(d) * sizeof(dictEntry*); } size_t dictEntryMemUsage(void) { @@ -950,6 +1032,11 @@ dictEntry *dictNext(dictIterator *iter) dictPauseRehashing(iter->d); else iter->fingerprint = dictFingerprint(iter->d); + + /* skip the rehashed slots in table[0] */ + if (dictIsRehashing(iter->d)) { + iter->index = iter->d->rehashidx - 1; + } } iter->index++; if (iter->index >= (long) DICTHT_SIZE(iter->d->ht_size_exp[iter->table])) { @@ -995,7 +1082,7 @@ dictEntry *dictGetRandomKey(dict *d) do { /* We are sure there are no elements in indexes from 0 * to rehashidx-1 */ - h = d->rehashidx + (randomULong() % (dictSlots(d) - d->rehashidx)); + h = d->rehashidx + (randomULong() % (dictBuckets(d) - d->rehashidx)); he = (h >= s0) ? d->ht_table[1][h - s0] : d->ht_table[0][h]; } while(he == NULL); } else { @@ -1127,7 +1214,7 @@ unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) { /* Reallocate the dictEntry, key and value allocations in a bucket using the * provided allocation functions in order to defrag them. 
*/ -static void dictDefragBucket(dict *d, dictEntry **bucketref, dictDefragFunctions *defragfns) { +static void dictDefragBucket(dictEntry **bucketref, dictDefragFunctions *defragfns) { dictDefragAllocFunction *defragalloc = defragfns->defragAlloc; dictDefragAllocFunction *defragkey = defragfns->defragKey; dictDefragAllocFunction *defragval = defragfns->defragVal; @@ -1154,8 +1241,6 @@ static void dictDefragBucket(dict *d, dictEntry **bucketref, dictDefragFunctions } if (newde) { *bucketref = newde; - if (d->type->afterReplaceEntry) - d->type->afterReplaceEntry(d, newde); } bucketref = dictGetNextRef(*bucketref); } @@ -1318,7 +1403,7 @@ unsigned long dictScanDefrag(dict *d, /* Emit entries at cursor */ if (defragfns) { - dictDefragBucket(d, &d->ht_table[htidx0][v & m0], defragfns); + dictDefragBucket(&d->ht_table[htidx0][v & m0], defragfns); } de = d->ht_table[htidx0][v & m0]; while (de) { @@ -1351,7 +1436,7 @@ unsigned long dictScanDefrag(dict *d, /* Emit entries at cursor */ if (defragfns) { - dictDefragBucket(d, &d->ht_table[htidx0][v & m0], defragfns); + dictDefragBucket(&d->ht_table[htidx0][v & m0], defragfns); } de = d->ht_table[htidx0][v & m0]; while (de) { @@ -1365,7 +1450,7 @@ unsigned long dictScanDefrag(dict *d, do { /* Emit entries at cursor */ if (defragfns) { - dictDefragBucket(d, &d->ht_table[htidx1][v & m1], defragfns); + dictDefragBucket(&d->ht_table[htidx1][v & m1], defragfns); } de = d->ht_table[htidx1][v & m1]; while (de) { @@ -1392,52 +1477,92 @@ unsigned long dictScanDefrag(dict *d, /* ------------------------- private functions ------------------------------ */ /* Because we may need to allocate huge memory chunk at once when dict - * expands, we will check this allocation is allowed or not if the dict - * type has expandAllowed member function. 
*/ -static int dictTypeExpandAllowed(dict *d) { - if (d->type->expandAllowed == NULL) return 1; - return d->type->expandAllowed( - DICTHT_SIZE(_dictNextExp(d->ht_used[0] + 1)) * sizeof(dictEntry*), + * resizes, we will check this allocation is allowed or not if the dict + * type has resizeAllowed member function. */ +static int dictTypeResizeAllowed(dict *d, size_t size) { + if (d->type->resizeAllowed == NULL) return 1; + return d->type->resizeAllowed( + DICTHT_SIZE(_dictNextExp(size)) * sizeof(dictEntry*), (double)d->ht_used[0] / DICTHT_SIZE(d->ht_size_exp[0])); } -/* Expand the hash table if needed */ -static int _dictExpandIfNeeded(dict *d) -{ +/* Returning DICT_OK indicates a successful expand or the dictionary is undergoing rehashing, + * and there is nothing else we need to do about this dictionary currently. While DICT_ERR indicates + * that expand has not been triggered (may be try shrinking?)*/ +int dictExpandIfNeeded(dict *d) { /* Incremental rehashing already in progress. Return. */ if (dictIsRehashing(d)) return DICT_OK; /* If the hash table is empty expand it to the initial size. */ - if (DICTHT_SIZE(d->ht_size_exp[0]) == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE); + if (DICTHT_SIZE(d->ht_size_exp[0]) == 0) { + dictExpand(d, DICT_HT_INITIAL_SIZE); + return DICT_OK; + } /* If we reached the 1:1 ratio, and we are allowed to resize the hash * table (global setting) or we should avoid it but the ratio between * elements/buckets is over the "safe" threshold, we resize doubling * the number of buckets. 
*/ - if (!dictTypeExpandAllowed(d)) - return DICT_OK; if ((dict_can_resize == DICT_RESIZE_ENABLE && d->ht_used[0] >= DICTHT_SIZE(d->ht_size_exp[0])) || (dict_can_resize != DICT_RESIZE_FORBID && - d->ht_used[0] / DICTHT_SIZE(d->ht_size_exp[0]) > dict_force_resize_ratio)) + d->ht_used[0] >= dict_force_resize_ratio * DICTHT_SIZE(d->ht_size_exp[0]))) { - return dictExpand(d, d->ht_used[0] + 1); + if (dictTypeResizeAllowed(d, d->ht_used[0] + 1)) + dictExpand(d, d->ht_used[0] + 1); + return DICT_OK; } - return DICT_OK; + return DICT_ERR; +} + +/* Expand the hash table if needed */ +static void _dictExpandIfNeeded(dict *d) { + /* Automatic resizing is disallowed. Return */ + if (d->pauseAutoResize > 0) return; + + dictExpandIfNeeded(d); +} + +/* Returning DICT_OK indicates a successful shrinking or the dictionary is undergoing rehashing, + * and there is nothing else we need to do about this dictionary currently. While DICT_ERR indicates + * that shrinking has not been triggered (may be try expanding?)*/ +int dictShrinkIfNeeded(dict *d) { + /* Incremental rehashing already in progress. Return. */ + if (dictIsRehashing(d)) return DICT_OK; + + /* If the size of hash table is DICT_HT_INITIAL_SIZE, don't shrink it. */ + if (DICTHT_SIZE(d->ht_size_exp[0]) <= DICT_HT_INITIAL_SIZE) return DICT_OK; + + /* If we reached below 1:8 elements/buckets ratio, and we are allowed to resize + * the hash table (global setting) or we should avoid it but the ratio is below 1:32, + * we'll trigger a resize of the hash table. 
*/ + if ((dict_can_resize == DICT_RESIZE_ENABLE && + d->ht_used[0] * HASHTABLE_MIN_FILL <= DICTHT_SIZE(d->ht_size_exp[0])) || + (dict_can_resize != DICT_RESIZE_FORBID && + d->ht_used[0] * HASHTABLE_MIN_FILL * dict_force_resize_ratio <= DICTHT_SIZE(d->ht_size_exp[0]))) + { + if (dictTypeResizeAllowed(d, d->ht_used[0])) + dictShrink(d, d->ht_used[0]); + return DICT_OK; + } + return DICT_ERR; +} + +static void _dictShrinkIfNeeded(dict *d) +{ + /* Automatic resizing is disallowed. Return */ + if (d->pauseAutoResize > 0) return; + + dictShrinkIfNeeded(d); } -/* TODO: clz optimization */ /* Our hash table capability is a power of two */ static signed char _dictNextExp(unsigned long size) { - unsigned char e = DICT_HT_INITIAL_EXP; - + if (size <= DICT_HT_INITIAL_SIZE) return DICT_HT_INITIAL_EXP; if (size >= LONG_MAX) return (8*sizeof(long)-1); - while(1) { - if (((unsigned long)1<= size) - return e; - e++; - } + + return 8*sizeof(long) - __builtin_clzl(size-1); } /* Finds and returns the position within the dict where the provided key should @@ -1447,20 +1572,34 @@ static signed char _dictNextExp(unsigned long size) void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing) { unsigned long idx, table; dictEntry *he; - uint64_t hash = dictHashKey(d, key); + uint64_t hash = dictHashKey(d, key, d->useStoredKeyApi); if (existing) *existing = NULL; - if (dictIsRehashing(d)) _dictRehashStep(d); + idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[0]); + + if (dictIsRehashing(d)) { + if ((long)idx >= d->rehashidx && d->ht_table[0][idx]) { + /* If we have a valid hash entry at `idx` in ht0, we perform + * rehash on the bucket at `idx` (being more CPU cache friendly) */ + _dictBucketRehash(d, idx); + } else { + /* If the hash entry is not in ht0, we rehash the buckets based + * on the rehashidx (not CPU cache friendly). 
*/ + _dictRehashStep(d); + } + } /* Expand the hash table if needed */ - if (_dictExpandIfNeeded(d) == DICT_ERR) - return NULL; + _dictExpandIfNeeded(d); + keyCmpFunc cmpFunc = dictGetKeyCmpFunc(d); + for (table = 0; table <= 1; table++) { + if (table == 0 && (long)idx < d->rehashidx) continue; idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]); /* Search if this slot does not already contain the given key */ he = d->ht_table[table][idx]; while(he) { void *he_key = dictGetKey(he); - if (key == he_key || dictCompareKeys(d, key, he_key)) { + if (key == he_key || cmpFunc(d, key, he_key)) { if (existing) *existing = he; return NULL; } @@ -1476,10 +1615,15 @@ void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing) } void dictEmpty(dict *d, void(callback)(dict*)) { + /* Someone may be monitoring a dict that started rehashing, before + * destroying the dict fake completion. */ + if (dictIsRehashing(d) && d->type->rehashingCompleted) + d->type->rehashingCompleted(d); _dictClear(d,0,callback); _dictClear(d,1,callback); d->rehashidx = -1; d->pauserehash = 0; + d->pauseAutoResize = 0; } void dictSetResizeEnabled(dictResizeEnable enable) { @@ -1487,7 +1631,7 @@ void dictSetResizeEnabled(dictResizeEnable enable) { } uint64_t dictGetHash(dict *d, const void *key) { - return dictHashKey(d, key); + return dictHashKey(d, key, d->useStoredKeyApi); } /* Finds the dictEntry using pointer and pre-calculated hash. 
@@ -1502,6 +1646,7 @@ dictEntry *dictFindEntryByPtrAndHash(dict *d, const void *oldptr, uint64_t hash) if (dictSize(d) == 0) return NULL; /* dict is empty */ for (table = 0; table <= 1; table++) { idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]); + if (table == 0 && (long)idx < d->rehashidx) continue; he = d->ht_table[table][idx]; while(he) { if (oldptr == dictGetKey(he)) @@ -1513,78 +1658,97 @@ dictEntry *dictFindEntryByPtrAndHash(dict *d, const void *oldptr, uint64_t hash) return NULL; } -/* ------------------------------- Debugging ---------------------------------*/ +/* Provides the old and new ht size for a given dictionary during rehashing. This method + * should only be invoked during initialization/rehashing. */ +void dictRehashingInfo(dict *d, unsigned long long *from_size, unsigned long long *to_size) { + /* Invalid method usage if rehashing isn't ongoing. */ + assert(dictIsRehashing(d)); + *from_size = DICTHT_SIZE(d->ht_size_exp[0]); + *to_size = DICTHT_SIZE(d->ht_size_exp[1]); +} +/* ------------------------------- Debugging ---------------------------------*/ #define DICT_STATS_VECTLEN 50 -size_t _dictGetStatsHt(char *buf, size_t bufsize, dict *d, int htidx, int full) { - unsigned long i, slots = 0, chainlen, maxchainlen = 0; - unsigned long totchainlen = 0; - unsigned long clvector[DICT_STATS_VECTLEN]; - size_t l = 0; - - if (d->ht_used[htidx] == 0) { - return snprintf(buf,bufsize, - "Hash table %d stats (%s):\n" - "No stats available for empty dictionaries\n", - htidx, (htidx == 0) ? "main hash table" : "rehashing target"); - } - - if (!full) { - l += snprintf(buf+l,bufsize-l, - "Hash table %d stats (%s):\n" - " table size: %lu\n" - " number of elements: %lu\n", - htidx, (htidx == 0) ? "main hash table" : "rehashing target", - DICTHT_SIZE(d->ht_size_exp[htidx]), d->ht_used[htidx]); - - /* Make sure there is a NULL term at the end. */ - buf[bufsize-1] = '\0'; - /* Unlike snprintf(), return the number of characters actually written. 
*/ - return strlen(buf); +void dictFreeStats(dictStats *stats) { + zfree(stats->clvector); + zfree(stats); +} + +void dictCombineStats(dictStats *from, dictStats *into) { + into->buckets += from->buckets; + into->maxChainLen = (from->maxChainLen > into->maxChainLen) ? from->maxChainLen : into->maxChainLen; + into->totalChainLen += from->totalChainLen; + into->htSize += from->htSize; + into->htUsed += from->htUsed; + for (int i = 0; i < DICT_STATS_VECTLEN; i++) { + into->clvector[i] += from->clvector[i]; } +} +dictStats *dictGetStatsHt(dict *d, int htidx, int full) { + unsigned long *clvector = zcalloc(sizeof(unsigned long) * DICT_STATS_VECTLEN); + dictStats *stats = zcalloc(sizeof(dictStats)); + stats->htidx = htidx; + stats->clvector = clvector; + stats->htSize = DICTHT_SIZE(d->ht_size_exp[htidx]); + stats->htUsed = d->ht_used[htidx]; + if (!full) return stats; /* Compute stats. */ - for (i = 0; i < DICT_STATS_VECTLEN; i++) clvector[i] = 0; - for (i = 0; i < DICTHT_SIZE(d->ht_size_exp[htidx]); i++) { + for (unsigned long i = 0; i < DICTHT_SIZE(d->ht_size_exp[htidx]); i++) { dictEntry *he; if (d->ht_table[htidx][i] == NULL) { clvector[0]++; continue; } - slots++; + stats->buckets++; /* For each hash entry on this slot... */ - chainlen = 0; + unsigned long chainlen = 0; he = d->ht_table[htidx][i]; while(he) { chainlen++; he = dictGetNext(he); } clvector[(chainlen < DICT_STATS_VECTLEN) ? chainlen : (DICT_STATS_VECTLEN-1)]++; - if (chainlen > maxchainlen) maxchainlen = chainlen; - totchainlen += chainlen; + if (chainlen > stats->maxChainLen) stats->maxChainLen = chainlen; + stats->totalChainLen += chainlen; } - /* Generate human readable stats. */ - l += snprintf(buf+l,bufsize-l, - "Hash table %d stats (%s):\n" - " table size: %lu\n" - " number of elements: %lu\n" - " different slots: %lu\n" - " max chain length: %lu\n" - " avg chain length (counted): %.02f\n" - " avg chain length (computed): %.02f\n" - " Chain length distribution:\n", - htidx, (htidx == 0) ? 
"main hash table" : "rehashing target", - DICTHT_SIZE(d->ht_size_exp[htidx]), d->ht_used[htidx], slots, maxchainlen, - (float)totchainlen/slots, (float)d->ht_used[htidx]/slots); - - for (i = 0; i < DICT_STATS_VECTLEN-1; i++) { - if (clvector[i] == 0) continue; - if (l >= bufsize) break; - l += snprintf(buf+l,bufsize-l, - " %ld: %ld (%.02f%%)\n", - i, clvector[i], ((float)clvector[i]/DICTHT_SIZE(d->ht_size_exp[htidx]))*100); + return stats; +} + +/* Generates human readable stats. */ +size_t dictGetStatsMsg(char *buf, size_t bufsize, dictStats *stats, int full) { + if (stats->htUsed == 0) { + return snprintf(buf,bufsize, + "Hash table %d stats (%s):\n" + "No stats available for empty dictionaries\n", + stats->htidx, (stats->htidx == 0) ? "main hash table" : "rehashing target"); + } + size_t l = 0; + l += snprintf(buf + l, bufsize - l, + "Hash table %d stats (%s):\n" + " table size: %lu\n" + " number of elements: %lu\n", + stats->htidx, (stats->htidx == 0) ? "main hash table" : "rehashing target", + stats->htSize, stats->htUsed); + if (full) { + l += snprintf(buf + l, bufsize - l, + " different slots: %lu\n" + " max chain length: %lu\n" + " avg chain length (counted): %.02f\n" + " avg chain length (computed): %.02f\n" + " Chain length distribution:\n", + stats->buckets, stats->maxChainLen, + (float) stats->totalChainLen / stats->buckets, (float) stats->htUsed / stats->buckets); + + for (unsigned long i = 0; i < DICT_STATS_VECTLEN - 1; i++) { + if (stats->clvector[i] == 0) continue; + if (l >= bufsize) break; + l += snprintf(buf + l, bufsize - l, + " %ld: %ld (%.02f%%)\n", + i, stats->clvector[i], ((float) stats->clvector[i] / stats->htSize) * 100); + } } /* Make sure there is a NULL term at the end. 
*/ @@ -1598,22 +1762,32 @@ void dictGetStats(char *buf, size_t bufsize, dict *d, int full) { char *orig_buf = buf; size_t orig_bufsize = bufsize; - l = _dictGetStatsHt(buf,bufsize,d,0,full); - if (dictIsRehashing(d) && bufsize > l) { - buf += l; - bufsize -= l; - _dictGetStatsHt(buf,bufsize,d,1,full); + dictStats *mainHtStats = dictGetStatsHt(d, 0, full); + l = dictGetStatsMsg(buf, bufsize, mainHtStats, full); + dictFreeStats(mainHtStats); + buf += l; + bufsize -= l; + if (dictIsRehashing(d) && bufsize > 0) { + dictStats *rehashHtStats = dictGetStatsHt(d, 1, full); + dictGetStatsMsg(buf, bufsize, rehashHtStats, full); + dictFreeStats(rehashHtStats); } /* Make sure there is a NULL term at the end. */ orig_buf[orig_bufsize-1] = '\0'; } +static int dictDefaultCompare(dict *d, const void *key1, const void *key2) { + (void)(d); /*unused*/ + return key1 == key2; +} + /* ------------------------------- Benchmark ---------------------------------*/ #ifdef REDIS_TEST #include "testhelp.h" #define UNUSED(V) ((void) V) +#define TEST(name) printf("test — %s\n", name); uint64_t hashCallback(const void *key) { return dictGenHashFunction((unsigned char*)key, strlen((char*)key)); @@ -1667,8 +1841,10 @@ dictType BenchmarkDictType = { int dictTest(int argc, char **argv, int flags) { long j; long long start, elapsed; + int retval; dict *dict = dictCreate(&BenchmarkDictType); long count = 0; + unsigned long new_dict_size, current_dict_used, remain_keys; int accurate = (flags & REDIS_TEST_ACCURATE); if (argc == 4) { @@ -1681,9 +1857,135 @@ int dictTest(int argc, char **argv, int flags) { count = 5000; } + TEST("Add 16 keys and verify dict resize is ok") { + dictSetResizeEnabled(DICT_RESIZE_ENABLE); + for (j = 0; j < 16; j++) { + retval = dictAdd(dict,stringFromLongLong(j),(void*)j); + assert(retval == DICT_OK); + } + while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000); + assert(dictSize(dict) == 16); + assert(dictBuckets(dict) == 16); + } + + TEST("Use DICT_RESIZE_AVOID to 
disable the dict resize and pad to (dict_force_resize_ratio * 16)") { + /* Use DICT_RESIZE_AVOID to disable the dict resize, and pad + * the number of keys to (dict_force_resize_ratio * 16), so we can satisfy + * dict_force_resize_ratio in next test. */ + dictSetResizeEnabled(DICT_RESIZE_AVOID); + for (j = 16; j < (long)dict_force_resize_ratio * 16; j++) { + retval = dictAdd(dict,stringFromLongLong(j),(void*)j); + assert(retval == DICT_OK); + } + current_dict_used = dict_force_resize_ratio * 16; + assert(dictSize(dict) == current_dict_used); + assert(dictBuckets(dict) == 16); + } + + TEST("Add one more key, trigger the dict resize") { + retval = dictAdd(dict,stringFromLongLong(current_dict_used),(void*)(current_dict_used)); + assert(retval == DICT_OK); + current_dict_used++; + new_dict_size = 1UL << _dictNextExp(current_dict_used); + assert(dictSize(dict) == current_dict_used); + assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 16); + assert(DICTHT_SIZE(dict->ht_size_exp[1]) == new_dict_size); + + /* Wait for rehashing. */ + dictSetResizeEnabled(DICT_RESIZE_ENABLE); + while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000); + assert(dictSize(dict) == current_dict_used); + assert(DICTHT_SIZE(dict->ht_size_exp[0]) == new_dict_size); + assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 0); + } + + TEST("Delete keys until we can trigger shrink in next test") { + /* Delete keys until we can satisfy (1 / HASHTABLE_MIN_FILL) in the next test. 
*/ + for (j = new_dict_size / HASHTABLE_MIN_FILL + 1; j < (long)current_dict_used; j++) { + char *key = stringFromLongLong(j); + retval = dictDelete(dict, key); + zfree(key); + assert(retval == DICT_OK); + } + current_dict_used = new_dict_size / HASHTABLE_MIN_FILL + 1; + assert(dictSize(dict) == current_dict_used); + assert(DICTHT_SIZE(dict->ht_size_exp[0]) == new_dict_size); + assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 0); + } + + TEST("Delete one more key, trigger the dict resize") { + current_dict_used--; + char *key = stringFromLongLong(current_dict_used); + retval = dictDelete(dict, key); + zfree(key); + unsigned long oldDictSize = new_dict_size; + new_dict_size = 1UL << _dictNextExp(current_dict_used); + assert(retval == DICT_OK); + assert(dictSize(dict) == current_dict_used); + assert(DICTHT_SIZE(dict->ht_size_exp[0]) == oldDictSize); + assert(DICTHT_SIZE(dict->ht_size_exp[1]) == new_dict_size); + + /* Wait for rehashing. */ + while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000); + assert(dictSize(dict) == current_dict_used); + assert(DICTHT_SIZE(dict->ht_size_exp[0]) == new_dict_size); + assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 0); + } + + TEST("Empty the dictionary and add 128 keys") { + dictEmpty(dict, NULL); + for (j = 0; j < 128; j++) { + retval = dictAdd(dict,stringFromLongLong(j),(void*)j); + assert(retval == DICT_OK); + } + while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000); + assert(dictSize(dict) == 128); + assert(dictBuckets(dict) == 128); + } + + TEST("Use DICT_RESIZE_AVOID to disable the dict resize and reduce to 3") { + /* Use DICT_RESIZE_AVOID to disable the dict reset, and reduce + * the number of keys until we can trigger shrinking in next test. 
*/ + dictSetResizeEnabled(DICT_RESIZE_AVOID); + remain_keys = DICTHT_SIZE(dict->ht_size_exp[0]) / (HASHTABLE_MIN_FILL * dict_force_resize_ratio) + 1; + for (j = remain_keys; j < 128; j++) { + char *key = stringFromLongLong(j); + retval = dictDelete(dict, key); + zfree(key); + assert(retval == DICT_OK); + } + current_dict_used = remain_keys; + assert(dictSize(dict) == remain_keys); + assert(dictBuckets(dict) == 128); + } + + TEST("Delete one more key, trigger the dict resize") { + current_dict_used--; + char *key = stringFromLongLong(current_dict_used); + retval = dictDelete(dict, key); + zfree(key); + new_dict_size = 1UL << _dictNextExp(current_dict_used); + assert(retval == DICT_OK); + assert(dictSize(dict) == current_dict_used); + assert(DICTHT_SIZE(dict->ht_size_exp[0]) == 128); + assert(DICTHT_SIZE(dict->ht_size_exp[1]) == new_dict_size); + + /* Wait for rehashing. */ + dictSetResizeEnabled(DICT_RESIZE_ENABLE); + while (dictIsRehashing(dict)) dictRehashMicroseconds(dict,1000); + assert(dictSize(dict) == current_dict_used); + assert(DICTHT_SIZE(dict->ht_size_exp[0]) == new_dict_size); + assert(DICTHT_SIZE(dict->ht_size_exp[1]) == 0); + } + + TEST("Restore to original state") { + dictEmpty(dict, NULL); + dictSetResizeEnabled(DICT_RESIZE_ENABLE); + } + start_benchmark(); for (j = 0; j < count; j++) { - int retval = dictAdd(dict,stringFromLongLong(j),(void*)j); + retval = dictAdd(dict,stringFromLongLong(j),(void*)j); assert(retval == DICT_OK); } end_benchmark("Inserting"); @@ -1691,7 +1993,7 @@ int dictTest(int argc, char **argv, int flags) { /* Wait for rehashing. 
*/ while (dictIsRehashing(dict)) { - dictRehashMilliseconds(dict,100); + dictRehashMicroseconds(dict,100*1000); } start_benchmark(); @@ -1741,7 +2043,7 @@ int dictTest(int argc, char **argv, int flags) { start_benchmark(); for (j = 0; j < count; j++) { char *key = stringFromLongLong(j); - int retval = dictDelete(dict,key); + retval = dictDelete(dict,key); assert(retval == DICT_OK); key[0] += 17; /* Change first number to letter. */ retval = dictAdd(dict,key,(void*)j); diff --git a/src/dict.h b/src/dict.h index e96cd44eb19..1c0e6accd32 100644 --- a/src/dict.h +++ b/src/dict.h @@ -5,32 +5,11 @@ * tables of power of two in size are used, collisions are handled by * chaining. See the source code for more information... :) * - * Copyright (c) 2006-2012, Salvatore Sanfilippo + * Copyright (c) 2006-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef __DICT_H @@ -44,18 +23,33 @@ #define DICT_OK 0 #define DICT_ERR 1 -typedef struct dictEntry dictEntry; /* opaque */ +/* Hash table parameters */ +#define HASHTABLE_MIN_FILL 8 /* Minimal hash table fill 12.5%(100/8) */ +typedef struct dictEntry dictEntry; /* opaque */ typedef struct dict dict; typedef struct dictType { + /* Callbacks */ uint64_t (*hashFunction)(const void *key); void *(*keyDup)(dict *d, const void *key); void *(*valDup)(dict *d, const void *obj); int (*keyCompare)(dict *d, const void *key1, const void *key2); void (*keyDestructor)(dict *d, void *key); void (*valDestructor)(dict *d, void *obj); - int (*expandAllowed)(size_t moreMem, double usedRatio); + int (*resizeAllowed)(size_t moreMem, double usedRatio); + /* Invoked at the start of dict initialization/rehashing (old and new ht are already created) */ + void (*rehashingStarted)(dict *d); + /* Invoked at the end of dict initialization/rehashing of all the entries from old to new ht. Both ht still exists + * and are cleaned up after this callback. */ + void (*rehashingCompleted)(dict *d); + /* Allow a dict to carry extra caller-defined metadata. The + * extra memory is initialized to 0 when a dict is allocated. 
*/ + size_t (*dictMetadataBytes)(dict *d); + + /* Data */ + void *userdata; + /* Flags */ /* The 'no_value' flag, if set, indicates that values are not used, i.e. the * dict is a set. When this flag is set, it's not possible to access the @@ -68,14 +62,32 @@ typedef struct dictType { unsigned int keys_are_odd:1; /* TODO: Add a 'keys_are_even' flag and use a similar optimization if that * flag is set. */ + /* Sometimes we want the ability to store a key in a given way inside the hash + * function, and lookup it in some other way without resorting to any kind of + * conversion. For instance the key may be stored as a structure also + * representing other things, but the lookup happens via just a pointer to a + * null terminated string. Optionally providing additional hash/cmp functions, + * dict supports such usage. In that case we'll have a hashFunction() that will + * expect a null terminated C string, and a storedHashFunction() that will + * instead expect the structure. Similarly, the two comparison functions will + * work differently. The keyCompare() will treat the first argument as a pointer + * to a C string and the other as a structure (this way we can directly lookup + * the structure key using the C string). While the storedKeyCompare() will + * check if two pointers to the key in structure form are the same. + * + * However, functions of dict that gets key as argument (void *key) don't get + * any indication whether it is a lookup or stored key. To indicate that + * you intend to use key of type stored-key, and, consequently, use + * dedicated compare and hash functions of stored-key, is by calling + * dictUseStoredKeyApi(1) before using any of the dict functions that gets + * key as a parameter and then call again dictUseStoredKeyApi(0) once done. + * + * Set to NULL both functions, if you don't want to support this feature. 
*/ + uint64_t (*storedHashFunction)(const void *key); + int (*storedKeyCompare)(dict *d, const void *key1, const void *key2); - /* Allow each dict and dictEntry to carry extra caller-defined metadata. The - * extra memory is initialized to 0 when allocated. */ - size_t (*dictEntryMetadataBytes)(dict *d); - size_t (*dictMetadataBytes)(void); - /* Optional callback called after an entry has been reallocated (due to - * active defrag). Only called if the entry has metadata. */ - void (*afterReplaceEntry)(dict *d, dictEntry *entry); + /* Optional callback called when the dict is destroyed. */ + void (*onDictRelease)(dict *d); } dictType; #define DICTHT_SIZE(exp) ((exp) == -1 ? 0 : (unsigned long)1<<(exp)) @@ -90,12 +102,12 @@ struct dict { long rehashidx; /* rehashing not in progress if rehashidx == -1 */ /* Keep small vars at end for optimal (minimal) struct padding */ - int16_t pauserehash; /* If >0 rehashing is paused (<0 indicates coding error) */ - signed char ht_size_exp[2]; /* exponent of size. (size = 1<0 rehashing is paused */ - void *metadata[]; /* An arbitrary number of bytes (starting at a - * pointer-aligned address) of size as defined - * by dictType's dictEntryBytes. */ + unsigned useStoredKeyApi : 1; /* See comment of storedHashFunction above */ + signed char ht_size_exp[2]; /* exponent of size. 
(size = 1<0 automatic resizing is disallowed (<0 indicates coding error) */ + void *metadata[]; }; /* If safe is set to 1 this is a safe iterator, that means, you can call @@ -111,6 +123,16 @@ typedef struct dictIterator { unsigned long long fingerprint; } dictIterator; +typedef struct dictStats { + int htidx; + unsigned long buckets; + unsigned long maxChainLen; + unsigned long totalChainLen; + unsigned long htSize; + unsigned long htUsed; + unsigned long *clvector; +} dictStats; + typedef void (dictScanFunction)(void *privdata, const dictEntry *de); typedef void *(dictDefragAllocFunction)(void *ptr); typedef struct { @@ -138,17 +160,20 @@ typedef struct { (d)->type->keyCompare((d), key1, key2) : \ (key1) == (key2)) -#define dictEntryMetadataSize(d) ((d)->type->dictEntryMetadataBytes \ - ? (d)->type->dictEntryMetadataBytes(d) : 0) -#define dictMetadataSize(d) ((d)->type->dictMetadataBytes \ - ? (d)->type->dictMetadataBytes() : 0) +#define dictMetadata(d) (&(d)->metadata) +#define dictMetadataSize(d) ((d)->type->dictMetadataBytes \ + ? (d)->type->dictMetadataBytes(d) : 0) -#define dictHashKey(d, key) ((d)->type->hashFunction(key)) -#define dictSlots(d) (DICTHT_SIZE((d)->ht_size_exp[0])+DICTHT_SIZE((d)->ht_size_exp[1])) +#define dictBuckets(d) (DICTHT_SIZE((d)->ht_size_exp[0])+DICTHT_SIZE((d)->ht_size_exp[1])) #define dictSize(d) ((d)->ht_used[0]+(d)->ht_used[1]) +#define dictIsEmpty(d) ((d)->ht_used[0] == 0 && (d)->ht_used[1] == 0) #define dictIsRehashing(d) ((d)->rehashidx != -1) #define dictPauseRehashing(d) ((d)->pauserehash++) #define dictResumeRehashing(d) ((d)->pauserehash--) +#define dictIsRehashingPaused(d) ((d)->pauserehash > 0) +#define dictPauseAutoResize(d) ((d)->pauseAutoResize++) +#define dictResumeAutoResize(d) ((d)->pauseAutoResize--) +#define dictUseStoredKeyApi(d, flag) ((d)->useStoredKeyApi = (flag)) /* If our unsigned long type can store a 64 bit number, use a 64 bit PRNG. 
*/ #if ULONG_MAX >= 0xffffffffffffffff @@ -165,9 +190,10 @@ typedef enum { /* API */ dict *dictCreate(dictType *type); +void dictTypeAddMeta(dict **d, dictType *typeWithMeta); int dictExpand(dict *d, unsigned long size); int dictTryExpand(dict *d, unsigned long size); -void *dictMetadata(dict *d); +int dictShrink(dict *d, unsigned long size); int dictAdd(dict *d, void *key, void *val); dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing); void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing); @@ -182,7 +208,8 @@ void dictTwoPhaseUnlinkFree(dict *d, dictEntry *he, dictEntry **plink, int table void dictRelease(dict *d); dictEntry * dictFind(dict *d, const void *key); void *dictFetchValue(dict *d, const void *key); -int dictResize(dict *d); +int dictShrinkIfNeeded(dict *d); +int dictExpandIfNeeded(dict *d); void dictSetKey(dict *d, dictEntry* de, void *key); void dictSetVal(dict *d, dictEntry *de, void *val); void dictSetSignedIntegerVal(dictEntry *de, int64_t val); @@ -216,13 +243,19 @@ uint64_t dictGenCaseHashFunction(const unsigned char *buf, size_t len); void dictEmpty(dict *d, void(callback)(dict*)); void dictSetResizeEnabled(dictResizeEnable enable); int dictRehash(dict *d, int n); -int dictRehashMilliseconds(dict *d, int ms); +int dictRehashMicroseconds(dict *d, uint64_t us); void dictSetHashFunctionSeed(uint8_t *seed); uint8_t *dictGetHashFunctionSeed(void); unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, void *privdata); unsigned long dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata); uint64_t dictGetHash(dict *d, const void *key); dictEntry *dictFindEntryByPtrAndHash(dict *d, const void *oldptr, uint64_t hash); +void dictRehashingInfo(dict *d, unsigned long long *from_size, unsigned long long *to_size); + +size_t dictGetStatsMsg(char *buf, size_t bufsize, dictStats *stats, int full); +dictStats* dictGetStatsHt(dict *d, int htidx, int 
full); +void dictCombineStats(dictStats *from, dictStats *into); +void dictFreeStats(dictStats *stats); #ifdef REDIS_TEST int dictTest(int argc, char *argv[], int flags); diff --git a/src/ebuckets.c b/src/ebuckets.c new file mode 100644 index 00000000000..f4f88fadee4 --- /dev/null +++ b/src/ebuckets.c @@ -0,0 +1,2440 @@ +/* + * Copyright Redis Ltd. 2024 - present + * + * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) + * or the Server Side Public License v1 (SSPLv1). + */ + +#include +#include +#include +#include +#include "zmalloc.h" +#include "redisassert.h" +#include "config.h" +#include "ebuckets.h" + +#define UNUSED(x) (void)(x) + + +/*** DEBUGGING & VALIDATION + * + * To validate DS on add(), remove() and ebExpire() + * #define EB_VALIDATE_DEBUG 1 + */ + +#if (REDIS_TEST || EB_VALIDATE_DEBUG) && !defined(EB_TEST_BENCHMARK) +#define EB_VALIDATE_STRUCTURE(eb, type) ebValidate(eb, type) +#else +#define EB_VALIDATE_STRUCTURE(eb, type) // Do nothing +#endif + +/*** BENCHMARK + * + * To benchmark ebuckets creation and active-expire with 10 million items, apply + * the following command such that `EB_TEST_BENCHMARK` gets desired distribution + * of expiration times: + * + * # 0=1msec, 1=1sec, 2=1min, 3=1hour, 4=1day, 5=1week, 6=1month + * make REDIS_CFLAGS='-DREDIS_TEST -DEB_TEST_BENCHMARK=3' && ./src/redis-server test ebuckets + */ + +/* + * Keep just enough bytes of bucket-key, taking into consideration configured + * EB_BUCKET_KEY_PRECISION, and ignoring LSB bits that has no impact. + * + * The main motivation is that since the bucket-key size determines the maximum + * depth of the rax tree, then we can prune the tree to be more shallow and thus + * reduce the maintenance and traversal of each node in the B-tree. 
+ */ +#if EB_BUCKET_KEY_PRECISION < 8 +#define EB_KEY_SIZE 6 +#elif EB_BUCKET_KEY_PRECISION >= 8 && EB_BUCKET_KEY_PRECISION < 16 +#define EB_KEY_SIZE 5 +#else +#define EB_KEY_SIZE 4 +#endif + +/* + * EB_SEG_MAX_ITEMS - Maximum number of items in rax-segment before trying to + * split. To simplify, it has the same value as EB_LIST_MAX_ITEMS. + */ +#define EB_SEG_MAX_ITEMS 16 +#define EB_LIST_MAX_ITEMS EB_SEG_MAX_ITEMS + +/* From expiration time to bucket-key */ +#define EB_BUCKET_KEY(exptime) ((exptime) >> EB_BUCKET_KEY_PRECISION) + + /* From bucket-key to expiration time */ +#define EB_BUCKET_EXP_TIME(bucketKey) ((uint64_t)(bucketKey) << EB_BUCKET_KEY_PRECISION) + +/*** structs ***/ + +typedef struct CommonSegHdr { + eItem head; +} CommonSegHdr; + + +/* FirstSegHdr - Header of first segment of a bucket. + * + * A bucket in rax tree with a single segment will be as follows: + * + * +-------------+ +------------+ +------------+ + * | FirstSegHdr | | eItem(1) | | eItem(N) | + * [rax] --> | eItem head | --> | void *next | --> ... --> | void *next | --+ + * +-------------+ +------------+ +------------+ | + * ^ | + * | | + * +-------------------------------------------------------+ + * + * Note that the cyclic references assist to update locally the segment(s) without + * the need to "heavy" traversal of the rax tree for each change. 
+ */ +typedef struct FirstSegHdr { + eItem head; /* first item in the list */ + uint32_t totalItems; /* total items in the bucket, across chained segments */ + uint32_t numSegs; /* number of segments in the bucket */ +} FirstSegHdr; + +/* NextSegHdr - Header of next segment in an extended-segment (bucket) + * + * Here is the layout of an extended-segment, after adding another item to a single, + * full (EB_SEG_MAX_ITEMS=16), segment (all items must have same bucket-key value): + * + * +-------------+ +------------+ +------------+ +------------+ +------------+ + * | FirstSegHdr | | eItem(17) | | NextSegHdr | | eItem(1) | | eItem(16) | + * [rax] --> | eItem head | --> | void *next | --> | eItem head | --> | void *next | --> ... --> | void *next | --+ + * +-------------+ +------------+ +------------+ +------------+ +------------+ | + * ^ | ^ | + * | | | | + * +------------- firstSeg / prevSeg -+ +------------------------------------------------------+ + */ +typedef struct NextSegHdr { + eItem head; + CommonSegHdr *prevSeg; /* pointer to previous segment */ + FirstSegHdr *firstSeg; /* pointer to first segment of the bucket */ +} NextSegHdr; + +/* Selective copy of ifndef from server.h instead of including it */ +#ifndef static_assert +#define static_assert(expr, lit) extern char __static_assert_failure[(expr) ? 1:-1] +#endif +/* Verify that "head" field is aligned in FirstSegHdr, NextSegHdr and CommonSegHdr */ +static_assert(offsetof(FirstSegHdr, head) == 0, "FirstSegHdr head is not aligned"); +static_assert(offsetof(NextSegHdr, head) == 0, "FirstSegHdr head is not aligned"); +static_assert(offsetof(CommonSegHdr, head) == 0, "FirstSegHdr head is not aligned"); +/* Verify attached metadata to rax is aligned */ +static_assert(offsetof(rax, metadata) % sizeof(void*) == 0, "metadata field is not aligned in rax"); + +/* EBucketNew - Indicates the caller to create a new bucket following the addition + * of another item to a bucket (either single-segment or extended-segment). 
*/ +typedef struct EBucketNew { + FirstSegHdr segment; + ExpireMeta *mLast; /* last item in the chain */ + uint64_t ebKey; +} EBucketNew; + +static void ebNewBucket(EbucketsType *type, EBucketNew *newBucket, eItem item, uint64_t key); +static int ebBucketPrint(uint64_t bucketKey, EbucketsType *type, FirstSegHdr *firstSeg); +static uint64_t *ebRaxNumItems(rax *rax); + +/*** Static functions ***/ + +/* Extract pointer to list from ebuckets handler */ +static inline rax *ebGetRaxPtr(ebuckets eb) { return (rax *)eb; } + +/* The lsb in ebuckets pointer determines whether the pointer points to rax or list. */ +static inline int ebIsList(ebuckets eb) { + return (((uintptr_t)(void *)eb & 0x1) == 1); +} +/* set lsb in ebuckets pointer to 1 to mark it as list. Unless empty (NULL) */ +static inline ebuckets ebMarkAsList(eItem item) { + if (item == NULL) return item; + + /* either 'itemsAddrAreOdd' or not, we end up with lsb is set to 1 */ + return (void *) ((uintptr_t) item | 1); +} + +/* Extract pointer to the list from ebuckets handler */ +static inline eItem ebGetListPtr(EbucketsType *type, ebuckets eb) { + /* if 'itemsAddrAreOdd' then no need to reset lsb bit */ + if (type->itemsAddrAreOdd) + return eb; + else + return (void*)((uintptr_t)(eb) & ~1); +} + +/* Converts the logical starting time value of a given bucket-key to its equivalent + * "physical" value in the context of an rax tree (rax-key). Although their values + * are the same, their memory layouts differ. The raxKey layout orders bytes in + * memory is from the MSB to the LSB, and the length of the key is EB_KEY_SIZE. */ +static inline void bucketKey2RaxKey(uint64_t bucketKey, unsigned char *raxKey) { + for (int i = EB_KEY_SIZE-1; i >= 0; --i) { + raxKey[i] = (unsigned char) (bucketKey & 0xFF); + bucketKey >>= 8; + } +} + +/* Converts the "physical" value of rax-key to its logical counterpart, representing + * the starting time value of a bucket. The values are equivalent, but their memory + * layouts differ. 
The raxKey is assumed to be ordered from the MSB to the LSB with + * a length of EB_KEY_SIZE. The resulting bucket-key is the logical representation + * with respect to ebuckets. */ +static inline uint64_t raxKey2BucketKey(unsigned char *raxKey) { + uint64_t bucketKey = 0; + for (int i = 0; i < EB_KEY_SIZE ; ++i) + bucketKey = (bucketKey<<8) + raxKey[i]; + return bucketKey; +} + +/* Add another item to a bucket that consists of extended-segments. In this + * scenario, all items in the bucket share the same bucket-key value and the first + * segment is already full (if not, the function ebSegAddAvail() would have being + * called). This requires the creation of another segment. The layout of the + * segments before and after the addition of the new item is as follows: + * + * Before: [segHdr] -> {item1,..,item16} -> [..] + * After: [segHdr] -> {newItem} -> [nextSegHdr] -> {item1,..,item16} -> [..] + * + * Taken care to persist `segHdr` to be the same instance after the change. + * This is important because the rax tree is pointing to it. 
 */
static int ebSegAddExtended(EbucketsType *type, FirstSegHdr *firstSegHdr, eItem newItem) {
    /* Allocate nextSegHdr and let it take the items of first segment header */
    NextSegHdr *nextSegHdr = zmalloc(sizeof(NextSegHdr));
    nextSegHdr->head = firstSegHdr->head;
    /* firstSegHdr will stay the first and new nextSegHdr will follow it */
    nextSegHdr->prevSeg = (CommonSegHdr *) firstSegHdr;
    nextSegHdr->firstSeg = firstSegHdr;

    /* Walk the EB_SEG_MAX_ITEMS items that moved to nextSegHdr, to reach the
     * last item of that segment and patch its back-pointer. */
    ExpireMeta *mIter = type->getExpireMeta(nextSegHdr->head);
    mIter->firstItemBucket = 0;
    for (int i = 0 ; i < EB_SEG_MAX_ITEMS-1 ; i++)
        mIter = type->getExpireMeta(mIter->next);

    if (mIter->lastItemBucket) {
        /* Last item of the bucket cycles back to its (new) owning segment header */
        mIter->next = nextSegHdr;
    } else {
        /* Update next-next-segment to point back to next-segment */
        NextSegHdr *nextNextSegHdr = mIter->next;
        nextNextSegHdr->prevSeg = (CommonSegHdr *) nextSegHdr;
    }

    firstSegHdr->numSegs += 1;
    firstSegHdr->totalItems += 1;
    firstSegHdr->head = newItem;

    /* newItem becomes the only item of the (reused) first segment */
    ExpireMeta *mNewItem = type->getExpireMeta(newItem);
    mNewItem->numItems = 1;
    mNewItem->next = nextSegHdr;
    mNewItem->firstItemBucket = 1;
    mNewItem->lastInSegment = 1;

    return 0;
}

/* Add another eItem to a segment with available space.
   Keep items sorted in ascending order */
static int ebSegAddAvail(EbucketsType *type, FirstSegHdr *seg, eItem item) {
    eItem head = seg->head;
    ExpireMeta *nextMeta;
    ExpireMeta *mHead = type->getExpireMeta(head);
    ExpireMeta *mItem = type->getExpireMeta(item);
    uint64_t itemExpireTime = ebGetMetaExpTime(mItem);

    seg->totalItems++;

    /* Caller guarantees there is room in this segment */
    assert(mHead->numItems < EB_SEG_MAX_ITEMS);

    /* if new item expiry time is smaller than the head then add it before the head */
    if (ebGetMetaExpTime(mHead) > itemExpireTime) {
        /* Insert item as the new head (head carries the segment item count) */
        mItem->next = head;
        mItem->firstItemBucket = mHead->firstItemBucket;
        mItem->numItems = mHead->numItems + 1;
        mHead->firstItemBucket = 0;
        mHead->numItems = 0;
        seg->head = item;
        return 0;
    }

    /* Insert item in the middle of segment */
    ExpireMeta *mIter = mHead;
    for (int i = 1 ; i < mHead->numItems ; i++) {
        nextMeta = type->getExpireMeta(mIter->next);
        /* Insert item in the middle */
        if (ebGetMetaExpTime(nextMeta) > itemExpireTime) {
            mHead->numItems = mHead->numItems + 1;
            mItem->next = mIter->next;
            mIter->next = item;
            return 0;
        }
        mIter = nextMeta;
    }

    /* Insert item as the last item of the segment. Inherit flags from previous last item */
    mHead->numItems = mHead->numItems + 1;
    mItem->next = mIter->next;
    mItem->lastInSegment = mIter->lastInSegment;
    mItem->lastItemBucket = mIter->lastItemBucket;
    mIter->lastInSegment = 0;
    mIter->lastItemBucket = 0;
    mIter->next = item;
    return 0;
}

/* Return 1 if split segment to two succeeded. Else, return 0.
The only reason + * the split can fail is that All the items in the segment have the same bucket-key */ +static int ebTrySegSplit(EbucketsType *type, FirstSegHdr *seg, EBucketNew *newBucket) { + int minMidDist=(EB_SEG_MAX_ITEMS / 2), bestMiddleIndex = -1; + uint64_t splitKey = -1; + eItem firstItemSecondPart; + ExpireMeta *mLastItemFirstPart, *mFirstItemSecondPart; + + eItem head = seg->head; + ExpireMeta *mHead = type->getExpireMeta(head); + ExpireMeta *mNext, *mIter = mHead; + + /* Search for best middle index to split the segment into two segments. As the + * items are arranged in ascending order, it cannot split between two items that + * have the same expiration time and therefore the split won't necessarily be + * balanced (Or won't be possible to split at all if all have the same exp-time!) + */ + for (int i = 0 ; i < EB_SEG_MAX_ITEMS-1 ; i++) { + //printf ("i=%d\n", i); + mNext = type->getExpireMeta(mIter->next); + if (EB_BUCKET_KEY(ebGetMetaExpTime(mNext)) > EB_BUCKET_KEY( + ebGetMetaExpTime(mIter))) { + /* If found better middle index before reaching halfway, save it */ + if (i < (EB_SEG_MAX_ITEMS/2)) { + splitKey = EB_BUCKET_KEY(ebGetMetaExpTime(mNext)); + bestMiddleIndex = i; + mLastItemFirstPart = mIter; + mFirstItemSecondPart = mNext; + firstItemSecondPart = mIter->next; + minMidDist = (EB_SEG_MAX_ITEMS / 2) - bestMiddleIndex; + } else { + /* after crossing the middle need only to look for the first diff */ + if (minMidDist > (i + 1 - EB_SEG_MAX_ITEMS / 2)) { + splitKey = EB_BUCKET_KEY(ebGetMetaExpTime(mNext)); + bestMiddleIndex = i; + mLastItemFirstPart = mIter; + mFirstItemSecondPart = mNext; + firstItemSecondPart = mIter->next; + minMidDist = i + 1 - EB_SEG_MAX_ITEMS / 2; + } + } + } + mIter = mNext; + } + + /* If cannot find index to split because all with same EB_BUCKET_KEY(), then + * segment should be treated as extended segment */ + if (bestMiddleIndex == -1) + return 0; + + /* New bucket */ + newBucket->segment.head = firstItemSecondPart; + 
newBucket->segment.numSegs = 1; + newBucket->segment.totalItems = EB_SEG_MAX_ITEMS - bestMiddleIndex - 1; + mFirstItemSecondPart->numItems = EB_SEG_MAX_ITEMS - bestMiddleIndex - 1; + newBucket->mLast = mIter; + newBucket->ebKey = splitKey; + mIter->lastInSegment = 1; + mIter->lastItemBucket = 1; + mIter->next = &newBucket->segment; /* to be updated by caller */ + mFirstItemSecondPart->firstItemBucket = 1; + + /* update existing bucket */ + seg->totalItems = bestMiddleIndex + 1; + mHead->numItems = bestMiddleIndex + 1; + mLastItemFirstPart->lastInSegment = 1; + mLastItemFirstPart->lastItemBucket = 1; + mLastItemFirstPart->next = seg; + return 1; +} + +/* Return 1 if managed to expire the entire segment. Returns 0 otherwise. */ +int ebSingleSegExpire(FirstSegHdr *firstSegHdr, + EbucketsType *type, + ExpireInfo *info, + eItem *updateList) +{ + uint64_t itemExpTime; + eItem iter = firstSegHdr->head; + ExpireMeta *mIter = type->getExpireMeta(iter); + uint32_t i=0, numItemsInSeg = mIter->numItems; + + while (info->itemsExpired < info->maxToExpire) { + itemExpTime = ebGetMetaExpTime(mIter); + + /* Items are arranged in ascending expire-time order in a segment. Stops + * active expiration when an item's expire time is greater than `now`. */ + if (itemExpTime > info->now) + break; + + /* keep aside next before deletion of iter */ + eItem next = mIter->next; + mIter->trash = 1; + ExpireAction act = info->onExpireItem(iter, info->ctx); + + /* if (act == ACT_REMOVE_EXP_ITEM) + * then don't touch the item. Assume it got deleted */ + + /* If indicated to stop then break (cb didn't delete the item) */ + if (act == ACT_STOP_ACTIVE_EXP) { + mIter->trash = 0; + break; + } + + /* If indicated to re-insert the item, then chain it to updateList. 
+ * it will be ebAdd() back to ebuckets at the end of ebExpire() */ + if (act == ACT_UPDATE_EXP_ITEM) { + mIter->next = *updateList; + *updateList = iter; + } + + ++info->itemsExpired; + + /* if deleted all items in segment, delete header and return */ + if (++i == numItemsInSeg) { + zfree(firstSegHdr); + return 1; + } + + /* More items in the segment. Set iter to next item and update mIter */ + iter = next; + mIter = type->getExpireMeta(iter); + } + + /* Update the single-segment with remaining items */ + mIter->numItems = numItemsInSeg - i; + mIter->firstItemBucket = 1; + firstSegHdr->head = iter; + firstSegHdr->totalItems -= i; + + /* Update nextExpireTime */ + info->nextExpireTime = ebGetMetaExpTime(mIter); + + return 0; +} + +/* return 1 if managed to expire the entire segment. Returns 0 otherwise. */ +static int ebSegExpire(FirstSegHdr *firstSegHdr, + EbucketsType *type, + ExpireInfo *info, + eItem *updateList) +{ + eItem iter = firstSegHdr->head; + uint32_t numSegs = firstSegHdr->numSegs; + void *nextSegHdr = firstSegHdr; + + if (numSegs == 1) + return ebSingleSegExpire(firstSegHdr, type, info, updateList); + + /* + * In an extended-segment, there's no need to verify the expiration time of + * each item. This is because all items in an extended-segment share the same + * bucket-key. Therefore, we can remove all items without checking their + * individual expiration times. This is different from a single-segment + * scenario, where items can have different bucket-keys. 
+ */ + for (uint32_t seg=0 ; seg < numSegs ; seg++) { + uint32_t i; + ExpireMeta *mIter = type->getExpireMeta(iter); + uint32_t numItemsInSeg = mIter->numItems; + + for (i = 0; (i < numItemsInSeg) && (info->itemsExpired < info->maxToExpire) ; ++i) { + mIter = type->getExpireMeta(iter); + + /* keep aside `next` before removing `iter` by onExpireItem */ + eItem next = mIter->next; + mIter->trash = 1; + ExpireAction act = info->onExpireItem(iter, info->ctx); + + /* if (act == ACT_REMOVE_EXP_ITEM) + * then don't touch the item. Assume it got deleted */ + + /* If indicated to stop then break (callback didn't delete the item) */ + if (act == ACT_STOP_ACTIVE_EXP) { + mIter->trash = 0; + break; + } + + /* If indicated to re-insert the item, then chain it to updateList. + * it will be ebAdd() back to ebuckets at the end of ebExpire() */ + if (act == ACT_UPDATE_EXP_ITEM) { + mIter->next = *updateList; + *updateList = iter; + } + + /* Item was REMOVED/UPDATED. Advance to `next` item */ + iter = next; + ++info->itemsExpired; + firstSegHdr->totalItems -= 1; + } + + /* if deleted all items in segment */ + if (i == numItemsInSeg) { + /* If not last segment in bucket, then delete segment header */ + if (seg + 1 < numSegs) { + nextSegHdr = iter; + iter = ((NextSegHdr *) nextSegHdr)->head; + zfree(nextSegHdr); + firstSegHdr->numSegs -= 1; + firstSegHdr->head = iter; + mIter = type->getExpireMeta(iter); + mIter->firstItemBucket = 1; + } + } else { + /* We reached here because for-loop above break due to + * ACT_STOP_ACTIVE_EXP or reached maxToExpire */ + firstSegHdr->head = iter; + mIter = type->getExpireMeta(iter); + mIter->numItems = numItemsInSeg - i; + mIter->firstItemBucket = 1; + info->nextExpireTime = ebGetMetaExpTime(mIter); + + /* If deleted one or more segments, update prevSeg of next seg to point firstSegHdr. 
+ * If it is the last segment, then last item need to point firstSegHdr */ + if (seg>0) { + int numItems = mIter->numItems; + for (int i = 0; i < numItems - 1; i++) + mIter = type->getExpireMeta(mIter->next); + + if (mIter->lastItemBucket) { + mIter->next = firstSegHdr; + } else { + /* Update next-segment to point back to firstSegHdr */ + NextSegHdr *nsh = mIter->next; + nsh->prevSeg = (CommonSegHdr *) firstSegHdr; + } + } + + return 0; + } + } + + /* deleted last segment in bucket */ + zfree(firstSegHdr); + return 1; +} + +/*** Static functions of list ***/ + +/* Convert a list to rax. + * + * To create a new rax, the function first converts the list to a segment by + * allocating a segment header and attaching to it the already existing list. + * Then, it adds the new segment to the rax as the first bucket. */ +static rax *ebConvertListToRax(eItem listHead, EbucketsType *type) { + FirstSegHdr *firstSegHdr = zmalloc(sizeof(FirstSegHdr)); + firstSegHdr->head = listHead; + firstSegHdr->totalItems = EB_LIST_MAX_ITEMS ; + firstSegHdr->numSegs = 1; + + /* update last item to point on the segment header */ + ExpireMeta *metaItem = type->getExpireMeta(listHead); + uint64_t bucketKey = EB_BUCKET_KEY(ebGetMetaExpTime(metaItem)); + while (metaItem->lastItemBucket == 0) + metaItem = type->getExpireMeta(metaItem->next); + metaItem->next = firstSegHdr; + + /* Use min expire-time for the first segment in rax */ + unsigned char raxKey[EB_KEY_SIZE]; + bucketKey2RaxKey(bucketKey, raxKey); + rax *rax = raxNewWithMetadata(sizeof(uint64_t)); + *ebRaxNumItems(rax) = EB_LIST_MAX_ITEMS; + raxInsert(rax, raxKey, EB_KEY_SIZE, firstSegHdr, NULL); + return rax; +} + +/** + * Adds another 'item' to the ebucket of type list, keeping the list sorted by + * ascending expiration time. + * + * @param eb - Pointer to the ebuckets handler of type list. Gets updated if the item is + * added as the new head. + * @param type - Pointer to the EbucketsType structure defining the type of ebucket. 
 * @param item - The eItem to be added to the list.
 *
 * @return 1 if the maximum list length is reached; otherwise, return 0.
 */
static int ebAddToList(ebuckets *eb, EbucketsType *type, eItem item) {
    ExpireMeta *metaItem = type->getExpireMeta(item);

    /* if ebucket-list is empty (NULL), then create a new list by marking 'item'
     * as the head and tail of the list */
    if (unlikely(ebIsEmpty(*eb))) {
        metaItem->next = NULL;
        metaItem->numItems = 1;
        metaItem->lastInSegment = 1;
        metaItem->firstItemBucket = 1;
        metaItem->lastItemBucket = 1;
        *eb = ebMarkAsList(item);
        return 0;
    }

    eItem head = ebGetListPtr(type, *eb);
    ExpireMeta *metaHead = type->getExpireMeta(head);

    /* If reached max items in list, then return 1 (caller converts to rax) */
    if (metaHead->numItems == EB_LIST_MAX_ITEMS)
        return 1;

    /* if expiry time of 'item' is smaller than the head then add it as the new head */
    if (ebGetMetaExpTime(metaHead) > ebGetMetaExpTime(metaItem)) {
        /* Insert item as the new head (head carries the list item count) */
        metaItem->next = head;
        metaItem->firstItemBucket = 1;
        metaItem->numItems = metaHead->numItems + 1;
        metaHead->firstItemBucket = 0;
        metaHead->numItems = 0;
        *eb = ebMarkAsList(item);
        return 0;
    }

    /* Try insert item in the middle of list */
    ExpireMeta *mIter = metaHead;
    for (int i = 1 ; i < metaHead->numItems ; i++) {
        ExpireMeta *nextMeta = type->getExpireMeta(mIter->next);
        /* Insert item in the middle */
        if (ebGetMetaExpTime(nextMeta) > ebGetMetaExpTime(metaItem)) {
            metaHead->numItems += 1;
            metaItem->next = mIter->next;
            mIter->next = item;
            return 0;
        }
        mIter = nextMeta;
    }

    /* Insert item as the last item of the list. */
    metaHead->numItems += 1;
    metaItem->next = NULL;
    metaItem->lastInSegment = 1;
    metaItem->lastItemBucket = 1;
    /* Update obsolete last item */
    mIter->lastInSegment = 0;
    mIter->lastItemBucket = 0;
    mIter->next = item;
    return 0;
}

/* return 1 if removed from list.
Otherwise, return 0 */ +static int ebRemoveFromList(ebuckets *eb, EbucketsType *type, eItem item) { + if (ebIsEmpty(*eb)) + return 0; /* not removed */ + + ExpireMeta *metaItem = type->getExpireMeta(item); + eItem head = ebGetListPtr(type, *eb); + + /* if item is the head of the list */ + if (head == item) { + eItem newHead = metaItem->next; + if (newHead != NULL) { + ExpireMeta *mNewHead = type->getExpireMeta(newHead); + mNewHead->numItems = metaItem->numItems - 1; + mNewHead->firstItemBucket = 1; + *eb = ebMarkAsList(newHead); + return 1; /* removed */ + } + *eb = NULL; + return 1; /* removed */ + } + + /* item is not the head of the list */ + ExpireMeta *metaHead = type->getExpireMeta(head); + + eItem iter = head; + while (iter != NULL) { + ExpireMeta *metaIter = type->getExpireMeta(iter); + if (metaIter->next == item) { + metaIter->next = metaItem->next; + /* If deleted item is the last in the list, then update new last item */ + if (metaItem->next == NULL) { + metaIter->lastInSegment = 1; + metaIter->lastItemBucket = 1; + } + metaHead->numItems -= 1; + return 1; /* removed */ + } + iter = metaIter->next; + } + return 0; /* not removed */ +} + +/* return 1 if none left. Otherwise return 0 */ +static int ebListExpire(ebuckets *eb, + EbucketsType *type, + ExpireInfo *info, + eItem *updateList) +{ + uint32_t expired = 0; + eItem item = ebGetListPtr(type, *eb); + ExpireMeta *metaItem = type->getExpireMeta(item); + uint32_t numItems = metaItem->numItems; /* first item must exists */ + + while (item != NULL) { + metaItem = type->getExpireMeta(item); + uint64_t itemExpTime = ebGetMetaExpTime(metaItem); + + /* Items are arranged in ascending expire-time order in a list. Stops list + * active expiration when an item's expiration time is greater than `now`. 
*/ + if (itemExpTime > info->now) + break; + + if (info->itemsExpired == info->maxToExpire) + break; + + /* keep aside `next` before removing `iter` by onExpireItem */ + eItem *next = metaItem->next; + metaItem->trash = 1; + ExpireAction act = info->onExpireItem(item, info->ctx); + + /* if (act == ACT_REMOVE_EXP_ITEM) + * then don't touch the item. Assume it got deleted */ + + /* If indicated to stop then break (cb didn't delete the item) */ + if (act == ACT_STOP_ACTIVE_EXP) { + metaItem->trash = 0; + break; + } + + /* If indicated to re-insert the item, then chain it to updateList. + * it will be ebAdd() back to ebuckets at the end of ebExpire() */ + if (act == ACT_UPDATE_EXP_ITEM) { + metaItem->next = *updateList; + *updateList = item; + } + + ++expired; + ++(info->itemsExpired); + item = next; + } + + if (expired == numItems) { + *eb = NULL; + info->nextExpireTime = EB_EXPIRE_TIME_INVALID; + return 1; + } + + metaItem->numItems = numItems - expired; + metaItem->firstItemBucket = 1; + info->nextExpireTime = ebGetMetaExpTime(metaItem); + *eb = ebMarkAsList(item); + return 0; +} + +/* Validate the general structure of the list */ +static void ebValidateList(eItem head, EbucketsType *type) { + if (head == NULL) + return; + + ExpireMeta *mHead = type->getExpireMeta(head); + eItem iter = head; + ExpireMeta *mIter = type->getExpireMeta(iter), *mIterPrev = NULL; + + for (int i = 0; i < mHead->numItems ; ++i) { + mIter = type->getExpireMeta(iter); + if (i == 0) { + /* first item */ + assert(mIter->numItems > 0 && mIter->numItems <= EB_LIST_MAX_ITEMS); + assert(mIter->firstItemBucket == 1); + } else { + /* Verify that expire time of previous item is smaller or equal */ + assert(ebGetMetaExpTime(mIterPrev) <= ebGetMetaExpTime(mIter)); + assert(mIter->numItems == 0); + assert(mIter->firstItemBucket == 0); + } + + if (i == (mHead->numItems - 1)) { + /* last item */ + assert(mIter->lastInSegment == 1); + assert(mIter->lastItemBucket == 1); + assert(mIter->next == NULL); + } 
else { + assert(mIter->lastInSegment == 0); + assert(mIter->lastItemBucket == 0); + assert(mIter->next != NULL); + mIterPrev = mIter; + iter = mIter->next; + } + } +} + +/*** Static functions of ebuckets / rax ***/ + +static uint64_t *ebRaxNumItems(rax *rax) { + return (uint64_t*) rax->metadata; +} + +/* Allocate a single segment with a single item */ +static void ebNewBucket(EbucketsType *type, EBucketNew *newBucket, eItem item, uint64_t key) { + ExpireMeta *mItem = type->getExpireMeta(item); + + newBucket->segment.head = item; + newBucket->segment.totalItems = 1; + newBucket->segment.numSegs = 1; + newBucket->mLast = type->getExpireMeta(item); + newBucket->ebKey = key; + mItem->numItems = 1; + mItem->firstItemBucket = 1; + mItem->lastInSegment = 1; + mItem->lastItemBucket = 1; + mItem->next = &newBucket->segment; +} + +/* + * ebBucketPrint - Prints all the segments in the bucket and time expiration + * of each item in the following fashion: + * + * Bucket(tot=0008,sgs=0001) : [11, 21, 26, 27, 29, 49, 59, 62] + * Bucket(tot=0007,sgs=0001) : [67, 86, 90, 92, 115, 123, 126] + * Bucket(tot=0005,sgs=0001) : [130, 135, 135, 136, 140] + * Bucket(tot=0009,sgs=0002) : [182] + * [162, 163, 167, 168, 172, 177, 183, 186] + * Bucket(tot=0001,sgs=0001) : [193] + */ +static int ebBucketPrint(uint64_t bucketKey, EbucketsType *type, FirstSegHdr *firstSeg) { + eItem iter; + ExpireMeta *mIter, *mHead; + static int PRINT_EXPIRE_META_FLAGS=0; + + iter = firstSeg->head; + mHead = type->getExpireMeta(iter); + + printf("Bucket(key=%06" PRIu64 ",tot=%04d,sgs=%04d) :", bucketKey, firstSeg->totalItems, firstSeg->numSegs); + while (1) { + mIter = type->getExpireMeta(iter); /* not really needed. 
Just to hush the compiler */
+        printf(" [");
+        for (int i = 0; i < mHead->numItems ; ++i) {
+            mIter = type->getExpireMeta(iter);
+            uint64_t expireTime = ebGetMetaExpTime(mIter);
+
+            if (i == 0 && PRINT_EXPIRE_META_FLAGS)
+                printf("%" PRIu64 "<%u,%u,%u,%u>, ",
+                       expireTime, mIter->numItems, mIter->firstItemBucket,
+                       mIter->lastInSegment, mIter->lastItemBucket);
+            else if (i == (mHead->numItems - 1) && PRINT_EXPIRE_META_FLAGS) {
+                printf("%" PRIu64 "<%u,%u,%u,%u>",
+                       expireTime, mIter->numItems, mIter->firstItemBucket,
+                       mIter->lastInSegment, mIter->lastItemBucket);
+            } else
+                printf("%" PRIu64 "%s", expireTime, (i == mHead->numItems - 1) ? "" : ", ");
+
+            iter = mIter->next;
+        }
+
+        if (mIter->lastItemBucket) {
+            printf("]\n");
+            break;
+        }
+        printf("]\n ");
+        iter = ((NextSegHdr *) mIter->next)->head;
+        mHead = type->getExpireMeta(iter);
+
+    }
+    return 0;
+}
+
+/* Add another eItem to bucket. If needed return 'newBucket' for insertion in rax tree.
+ *
+ * 1) If the bucket is based on a single, not full segment, then add the item to the segment.
+ * 2) If a single, full segment, then try to split it and then add the item.
+ * 3) If failed to split, then all items in the bucket have the same bucket-key.
+ *    - If the new item has the same bucket-key, then extend the segment to
+ *      be an extended-segment, if not already, and add the item to it.
+ *    - If the new item has a different bucket-key, then allocate a new bucket
+ *      for it.
+ */ +static int ebAddToBucket(EbucketsType *type, + FirstSegHdr *firstSegBkt, + eItem item, + EBucketNew *newBucket, + uint64_t *updateBucketKey) +{ + newBucket->segment.head = NULL; /* no new bucket as default */ + + if (firstSegBkt->numSegs == 1) { + /* If bucket is a single, not full segment, then add the item to the segment */ + if (firstSegBkt->totalItems < EB_SEG_MAX_ITEMS) + return ebSegAddAvail(type, firstSegBkt, item); + + /* If bucket is a single, full segment, and segment split succeeded */ + if (ebTrySegSplit(type, firstSegBkt, newBucket) == 1) { + /* The split got failed only because all items in the segment have the + * same bucket-key */ + ExpireMeta *mItem = type->getExpireMeta(item); + + /* Check which of the two segments the new item should be added to. Note that + * after the split, bucket-key of `newBucket` is bigger than bucket-key of + * `firstSegBkt`. That is `firstSegBkt` preserves its bucket-key value + * (and its location in rax tree) before the split */ + if (EB_BUCKET_KEY(ebGetMetaExpTime(type->getExpireMeta(item))) < newBucket->ebKey) { + return ebSegAddAvail(type, firstSegBkt, item); + } else { + /* Add the `item` to the new bucket */ + ebSegAddAvail(type, &(newBucket->segment), item); + + /* if new item is now last item in the segment, then update lastItemBucket */ + if (mItem->lastItemBucket) + newBucket->mLast = mItem; + return 0; + } + } + } + + /* If reached here, then either: + * (1) a bucket with multiple segments + * (2) Or, a single, full segment which failed to split. + * + * Either way, all items in the bucket have the same bucket-key value. Thus: + * (A) If 'item' has the same bucket-key as the ones in this bucket, then add it as well + * (B) Else, allocate a new bucket for it. 
+ */ + + ExpireMeta *mHead = type->getExpireMeta(firstSegBkt->head); + ExpireMeta *mItem = type->getExpireMeta(item); + + uint64_t bucketKey = EB_BUCKET_KEY(ebGetMetaExpTime(mHead)); /* same for all items in the segment */ + uint64_t itemKey = EB_BUCKET_KEY(ebGetMetaExpTime(mItem)); + + if (bucketKey == itemKey) { + /* New item has the same bucket-key as the ones in this bucket, Add it as well */ + if (mHead->numItems < EB_SEG_MAX_ITEMS) + return ebSegAddAvail(type, firstSegBkt, item); /* Add item to first segment */ + else { + /* If a regular segment becomes extended-segment, then update the + * bucket-key to be aligned with the expiration-time of the items + * it contains */ + if (firstSegBkt->numSegs == 1) + *updateBucketKey = bucketKey; + + return ebSegAddExtended(type, firstSegBkt, item); /* Add item in a new segment */ + } + } else { + /* If the item cannot be added to the visited (extended-segment) bucket + * because it has a key not equal to bucket-key, then need to allocate a new + * bucket for the item. If the key of the item is below the bucket-key of + * the visited bucket, then the new item will be added to a new segment + * before it and the visited bucket key will be updated to accurately + * reflect the bucket-key of the (extended-segment) bucket */ + if (bucketKey > itemKey) + *updateBucketKey = bucketKey; + + ebNewBucket(type, newBucket, item, EB_BUCKET_KEY(ebGetMetaExpTime(mItem))); + return 0; + } +} + +/* + * Remove item from rax + * + * Return 1 if removed. Otherwise, return 0 + * + * Note: The function is optimized to remove items locally from segments without + * traversing rax tree or stepping long extended-segments. Therefore, it is + * assumed that the item is present in the bucket without verification. + * + * TODO: Written straightforward. Should be optimized to merge small segments. 
+ */ +static int ebRemoveFromRax(ebuckets *eb, EbucketsType *type, eItem item) { + ExpireMeta *mItem = type->getExpireMeta(item); + rax *rax = ebGetRaxPtr(*eb); + + /* if item is the only one left in a single-segment bucket, then delete bucket */ + if (unlikely(mItem->firstItemBucket && mItem->lastItemBucket)) { + raxIterator ri; + raxStart(&ri, rax); + unsigned char raxKey[EB_KEY_SIZE]; + bucketKey2RaxKey(EB_BUCKET_KEY(ebGetMetaExpTime(mItem)), raxKey); + raxSeek(&ri, "<=", raxKey, EB_KEY_SIZE); + + if (raxNext(&ri) == 0) + return 0; /* not removed */ + + FirstSegHdr *segHdr = ri.data; + + if (segHdr->head != item) + return 0; /* not removed */ + + zfree(segHdr); + raxRemove(ri.rt, ri.key, EB_KEY_SIZE, NULL); + raxStop(&ri); + + /* If last bucket in rax, then delete the rax */ + if (rax->numele == 0) { + raxFree(rax); + *eb = NULL; + return 1; /* removed */ + } + } else if (mItem->numItems == 1) { + /* If the `item` is the only one in its segment, there must be additional + * items and segments in this bucket. If there weren't, the item would + * have been removed by the previous condition. */ + + if (mItem->firstItemBucket) { + /* If the first item/segment in extended-segments, then + * - Remove current segment (with single item) and promote next-segment to be first. 
+ * - Update first item of next-segment to be firstItemBucket + * - Update `prevSeg` next-of-next segment to point new header of next-segment + * - Update FirstSegHdr to totalItems-1, numSegs-1 */ + NextSegHdr *nextHdr = mItem->next; + FirstSegHdr *firstHdr = (FirstSegHdr *) nextHdr->prevSeg; + firstHdr->head = nextHdr->head; + firstHdr->totalItems--; + firstHdr->numSegs--; + zfree(nextHdr); + eItem *iter = firstHdr->head; + ExpireMeta *mIter = type->getExpireMeta(iter); + mIter->firstItemBucket = 1; + while (mIter->lastInSegment == 0) { + iter = mIter->next; + mIter = type->getExpireMeta(iter); + } + if (mIter->lastItemBucket) + mIter->next = firstHdr; + else + ((NextSegHdr *) mIter->next)->prevSeg = (CommonSegHdr *) firstHdr; + + } else if (mItem->lastItemBucket) { + /* If last item/segment in bucket, then + * - promote previous segment to be last segment + * - Update FirstSegHdr to totalItems-1, numSegs-1 */ + NextSegHdr *currHdr = mItem->next; + CommonSegHdr *prevHdr = currHdr->prevSeg; + eItem iter = prevHdr->head; + ExpireMeta *mIter = type->getExpireMeta(iter); + while (mIter->lastInSegment == 0) { + iter = mIter->next; + mIter = type->getExpireMeta(iter); + } + currHdr->firstSeg->totalItems--; + currHdr->firstSeg->numSegs--; + mIter->next = prevHdr; + mIter->lastItemBucket = 1; + zfree(currHdr); + + } else { + /* item/segment is not the first or last item/segment. + * - Update previous segment to point next segment. 
+ * - Update `prevSeg` of next segment + * - Update FirstSegHdr to totalItems-1, numSegs-1 */ + NextSegHdr *nextHdr = mItem->next; + NextSegHdr *currHdr = (NextSegHdr *) nextHdr->prevSeg; + CommonSegHdr *prevHdr = currHdr->prevSeg; + + ExpireMeta *mIter = type->getExpireMeta(prevHdr->head); + while (mIter->lastInSegment == 0) + mIter = type->getExpireMeta(mIter->next); + + mIter->next = nextHdr; + nextHdr->prevSeg = prevHdr; + nextHdr->firstSeg->totalItems--; + nextHdr->firstSeg->numSegs--; + zfree(currHdr); + + } + } else { + /* At least 2 items in current segment */ + if (mItem->numItems) { + /* If item is first item in segment (Must be numItems>1), then + * - Find segment header and update to point next item. + * - Let next inherit 'item' flags {firstItemBucket, numItems-1} + * - Update FirstSegHdr to totalItems-1 */ + ExpireMeta *mIter = mItem; + CommonSegHdr *currHdr; + while (mIter->lastInSegment == 0) + mIter = type->getExpireMeta(mIter->next); + if (mIter->lastItemBucket) + currHdr = (CommonSegHdr *) mIter->next; + else + currHdr = (CommonSegHdr *) ((NextSegHdr *) mIter->next)->prevSeg; + + if (mItem->firstItemBucket) + ((FirstSegHdr *) currHdr)->totalItems--; + else + ((NextSegHdr *) currHdr)->firstSeg->totalItems--; + + eItem *newHead = mItem->next; + ExpireMeta *mNewHead = type->getExpireMeta(newHead); + mNewHead->firstItemBucket = mItem->firstItemBucket; + mNewHead->numItems = mItem->numItems - 1; + currHdr->head = newHead; + + } else if (mItem->lastInSegment) { + /* If item is last in segment, then + * - find previous item and let it inherit (next, lastInSegment, lastItemBucket) + * - Find and update segment header to numItems-1 + * - Update FirstSegHdr to totalItems-1 */ + CommonSegHdr *currHdr; + if (mItem->lastItemBucket) + currHdr = (CommonSegHdr *) mItem->next; + else + currHdr = (CommonSegHdr *) ((NextSegHdr *) mItem->next)->prevSeg; + + ExpireMeta *mHead = type->getExpireMeta(currHdr->head); + mHead->numItems--; + ExpireMeta *mIter = mHead; + 
while (mIter->next != item) + mIter = type->getExpireMeta(mIter->next); + + mIter->next = mItem->next; + mIter->lastInSegment = mItem->lastInSegment; + mIter->lastItemBucket = mItem->lastItemBucket; + + if (mHead->firstItemBucket) + ((FirstSegHdr *) currHdr)->totalItems--; + else + ((NextSegHdr *) currHdr)->firstSeg->totalItems--; + + } else { + /* - Item is in the middle of segment. Find previous item and update to point next. + * - Find and Update segment header to numItems-1 + * - Update FirstSegHdr to totalItems-1 */ + ExpireMeta *mIter = mItem; + CommonSegHdr *currHdr; + while (mIter->lastInSegment == 0) + mIter = type->getExpireMeta(mIter->next); + if (mIter->lastItemBucket) + currHdr = (CommonSegHdr *) mIter->next; + else + currHdr = (CommonSegHdr *) ((NextSegHdr *) mIter->next)->prevSeg; + + ExpireMeta *mHead = type->getExpireMeta(currHdr->head); + mHead->numItems--; + mIter = mHead; + while (mIter->next != item) + mIter = type->getExpireMeta(mIter->next); + + mIter->next = mItem->next; + mIter->lastInSegment = mItem->lastInSegment; + mIter->lastItemBucket = mItem->lastItemBucket; + + if (mHead->firstItemBucket) + ((FirstSegHdr *) currHdr)->totalItems--; + else + ((NextSegHdr *) currHdr)->firstSeg->totalItems--; + } + } + *ebRaxNumItems(rax) -= 1; + return 1; /* removed */ +} + +int ebAddToRax(ebuckets *eb, EbucketsType *type, eItem item, uint64_t bucketKeyItem) { + EBucketNew newBucket; /* ebAddToBucket takes care to update newBucket.segment.head */ + raxIterator iter; + unsigned char raxKey[EB_KEY_SIZE]; + bucketKey2RaxKey(bucketKeyItem, raxKey); + rax *rax = ebGetRaxPtr(*eb); + raxStart(&iter,rax); + raxSeek(&iter, "<=", raxKey, EB_KEY_SIZE); + *ebRaxNumItems(rax) += 1; + /* If expireTime of the item is below the bucket-key of first bucket in rax, + * then need to add it as a new bucket at the beginning of the rax. 
*/ + if(raxNext(&iter) == 0) { + FirstSegHdr *firstSegHdr = zmalloc(sizeof(FirstSegHdr)); + firstSegHdr->head = item; + firstSegHdr->totalItems = 1; + firstSegHdr->numSegs = 1; + + /* update last item to point on the segment header */ + ExpireMeta *metaItem = type->getExpireMeta(item); + metaItem->lastItemBucket = 1; + metaItem->lastInSegment = 1; + metaItem->firstItemBucket = 1; + metaItem->numItems = 1; + metaItem->next = firstSegHdr; + bucketKey2RaxKey(bucketKeyItem, raxKey); + raxInsert(rax, raxKey, EB_KEY_SIZE, firstSegHdr, NULL); + raxStop(&iter); + return 0; + } + + /* Add the new item into the first segment of the bucket that we found */ + uint64_t updateBucketKey = 0; + ebAddToBucket(type, iter.data, item, &newBucket, &updateBucketKey); + + /* If following the addition need to `updateBucketKey` of `foundBucket` in rax */ + if(unlikely(updateBucketKey && updateBucketKey != raxKey2BucketKey(iter.key))) { + raxRemove(iter.rt, iter.key, EB_KEY_SIZE, NULL); + bucketKey2RaxKey(updateBucketKey, raxKey); + raxInsert(iter.rt, raxKey, EB_KEY_SIZE, iter.data, NULL); + } + + /* If ebAddToBucket() returned a new bucket, then add the bucket to rax. + * + * This might happen when trying to add another item to a bucket that is: + * 1. A single, full segment. Will result in a bucket (segment) split. + * 2. Extended segment with a different bucket-key than the new item. + * Will result in a new bucket (of size 1) for the new item. 
+ */ + if (newBucket.segment.head != NULL) { + /* Allocate segment header for the new bucket */ + FirstSegHdr *newSeg = zmalloc(sizeof(FirstSegHdr)); + /* Move the segment from 'newBucket' to allocated segment header */ + *newSeg = newBucket.segment; + /* Update 'next' of last item in segment to point to 'FirstSegHdr` */ + newBucket.mLast->next = newSeg; + /* Insert the new bucket to rax */ + bucketKey2RaxKey(newBucket.ebKey, raxKey); + raxInsert(iter.rt, raxKey, EB_KEY_SIZE, newSeg, NULL); + } + + raxStop(&iter); + return 0; +} + +/* Validate the general structure of the buckets in rax */ +static void ebValidateRax(rax *rax, EbucketsType *type) { + uint64_t numItemsTotal = 0; + raxIterator raxIter; + raxStart(&raxIter, rax); + raxSeek(&raxIter, "^", NULL, 0); + while (raxNext(&raxIter)) { + int expectFirstItemBucket = 1; + FirstSegHdr *firstSegHdr = raxIter.data; + eItem iter; + ExpireMeta *mIter, *mHead; + iter = firstSegHdr->head; + mHead = type->getExpireMeta(iter); + uint64_t numItemsBucket = 0, countSegments = 0; + + int extendedSeg = (firstSegHdr->numSegs > 1) ? 
1 : 0; + void *segHdr = firstSegHdr; + + mIter = type->getExpireMeta(iter); + while (1) { + uint64_t curBktKey, prevBktKey; + for (int i = 0; i < mHead->numItems ; ++i) { + assert(iter != NULL); + mIter = type->getExpireMeta(iter); + curBktKey = EB_BUCKET_KEY(ebGetMetaExpTime(mIter)); + + if (i == 0) { + assert(mIter->numItems > 0 && mIter->numItems <= EB_SEG_MAX_ITEMS); + assert(mIter->firstItemBucket == expectFirstItemBucket); + expectFirstItemBucket = 0; + prevBktKey = curBktKey; + } else { + assert( (extendedSeg && prevBktKey == curBktKey) || + (!extendedSeg && prevBktKey <= curBktKey) ); + assert(mIter->numItems == 0); + assert(mIter->firstItemBucket == 0); + prevBktKey = curBktKey; + } + + if (i == mHead->numItems - 1) + assert(mIter->lastInSegment == 1); + else + assert(mIter->lastInSegment == 0); + + iter = mIter->next; + } + + numItemsBucket += mHead->numItems; + countSegments += 1; + + if (mIter->lastItemBucket) + break; + + NextSegHdr *nextSegHdr = mIter->next; + assert(nextSegHdr->firstSeg == firstSegHdr); + assert(nextSegHdr->prevSeg == segHdr); + iter = nextSegHdr->head; + mHead = type->getExpireMeta(iter); + segHdr = nextSegHdr; + } + /* Verify next of last item, `totalItems` and `numSegs` in iterated bucket */ + assert(mIter->next == segHdr); + assert(numItemsBucket == firstSegHdr->totalItems); + assert(countSegments == firstSegHdr->numSegs); + numItemsTotal += numItemsBucket; + } + raxStop(&raxIter); + assert(numItemsTotal == *ebRaxNumItems(rax)); +} + +struct deleteCbCtx { EbucketsType *type; void *userCtx; }; +void ebRaxDeleteCb(void *item, void *context) { + struct deleteCbCtx *ctx = context; + FirstSegHdr *firstSegHdr = item; + eItem itemIter = firstSegHdr->head; + uint32_t numSegs = firstSegHdr->numSegs; + void *nextSegHdr = firstSegHdr; + + for (uint32_t seg=0 ; seg < numSegs ; seg++) { + zfree(nextSegHdr); + + ExpireMeta *mIter = ctx->type->getExpireMeta(itemIter); + uint32_t numItemsInSeg = mIter->numItems; + + for (uint32_t i = 0; i < 
numItemsInSeg ; ++i) { + mIter = ctx->type->getExpireMeta(itemIter); + eItem toDelete = itemIter; + mIter->trash = 1; + itemIter = mIter->next; + if (ctx->type->onDeleteItem) ctx->type->onDeleteItem(toDelete, &ctx->userCtx); + } + nextSegHdr = itemIter; + + if (seg + 1 < numSegs) + itemIter = ((NextSegHdr *) nextSegHdr)->head; + } + +} + +static void _ebPrint(ebuckets eb, EbucketsType *type, int64_t usedMem, int printItems) { + if (ebIsEmpty(eb)) { + printf("Empty ebuckets\n"); + return; + } + + if (ebIsList(eb)) { + /* mock rax segment */ + eItem head = ebGetListPtr(type, eb); + ExpireMeta *metaHead = type->getExpireMeta(head); + FirstSegHdr mockSeg = { head, metaHead->numItems, 1}; + if (printItems) + ebBucketPrint(0, type, &mockSeg); + return; + } + + uint64_t totalItems = 0; + uint64_t numBuckets = 0; + uint64_t numSegments = 0; + + rax *rax = ebGetRaxPtr(eb); + raxIterator iter; + raxStart(&iter, rax); + raxSeek(&iter, "^", NULL, 0); + while (raxNext(&iter)) { + FirstSegHdr *seg = iter.data; + if (printItems) + ebBucketPrint(raxKey2BucketKey(iter.key), type, seg); + totalItems += seg->totalItems; + numBuckets++; + numSegments += seg->numSegs; + } + + printf("Total number of items : %" PRIu64 "\n", totalItems); + printf("Total number of buckets : %" PRIu64 "\n", numBuckets); + printf("Total number of segments : %" PRIu64 "\n", numSegments); + printf("Average items per bucket : %.2f\n", + (double) totalItems / numBuckets); + printf("Average items per segment : %.2f\n", + (double) totalItems / numSegments); + printf("Average segments per bucket : %.2f\n", + (double) numSegments / numBuckets); + + if (usedMem != -1) + { + printf("\nEbuckets memory usage (including FirstSegHdr/NexSegHdr):\n"); + printf("Total : %.2f KBytes\n", + (double) usedMem / 1024); + printf("Average per bucket : %" PRIu64 " Bytes\n", + usedMem / numBuckets); + printf("Average per item : %" PRIu64 " Bytes\n", + usedMem / totalItems); + printf("EB_BUCKET_KEY_PRECISION : %d\n", + 
EB_BUCKET_KEY_PRECISION); + printf("EB_SEG_MAX_ITEMS : %d\n", + EB_SEG_MAX_ITEMS); + } + raxStop(&iter); +} + +/*** API functions ***/ + +/** + * Deletes all items from given ebucket, invoking optional item deletion callbacks. + * + * @param eb - The ebucket to be deleted. + * @param type - Pointer to the EbucketsType structure defining the type of ebucket. + * @param ctx - A context pointer that can be used in optional item deletion callbacks. + */ +void ebDestroy(ebuckets *eb, EbucketsType *type, void *ctx) { + if (ebIsEmpty(*eb)) + return; + + if (ebIsList(*eb)) { + eItem head = ebGetListPtr(type, *eb); + eItem *pItemNext = &head; + while ( (*pItemNext) != NULL) { + eItem toDelete = *pItemNext; + ExpireMeta *metaToDelete = type->getExpireMeta(toDelete); + *pItemNext = metaToDelete->next; + metaToDelete->trash = 1; + if (type->onDeleteItem) type->onDeleteItem(toDelete, ctx); + } + } else { + struct deleteCbCtx deleteCtx = {type, ctx}; + raxFreeWithCbAndContext(ebGetRaxPtr(*eb), ebRaxDeleteCb, &deleteCtx); + } + + *eb = NULL; +} + +/** + * Removes the specified item from the given ebucket, updating the ebuckets handler + * accordingly. The function is optimized to remove items locally from segments + * without traversing rax tree or stepping long extended-segments. Therefore, + * it is assumed that the item is present in the bucket without verification. + * + * @param eb - Pointer to the ebuckets handler, which may get updated if the removal + * affects the structure. + * @param type - Pointer to the EbucketsType structure defining the type of ebucket. + * @param item - The eItem to be removed from the ebucket. + * + * @return 1 if the item was successfully removed; otherwise, return 0. 
+ */ +int ebRemove(ebuckets *eb, EbucketsType *type, eItem item) { + + if (ebIsEmpty(*eb)) + return 0; /* not removed */ + + int res; + if (ebIsList(*eb)) + res = ebRemoveFromList(eb, type, item); + else /* rax */ + res = ebRemoveFromRax(eb, type, item); + + /* if removed then mark as trash */ + if (res) + type->getExpireMeta(item)->trash = 1; + + EB_VALIDATE_STRUCTURE(*eb, type); + + return res; +} + +/** + * Adds the specified item to the ebucket structure based on expiration time. + * If the ebucket is a list or empty, it attempts to add the item to the list. + * Otherwise, it adds the item to rax. If the list reaches its maximum size, it + * is converted to rax. The ebuckets handler may be updated accordingly. + * + * @param eb - Pointer to the ebuckets handler, which may get updated + * @param type - Pointer to the EbucketsType structure defining the type of ebucket. + * @param item - The eItem to be added to the ebucket. + * @param expireTime - The expiration time of the item. + * + * @return 0 (C_OK) if the item was successfully added; + * Otherwise, return -1 (C_ERR) on failure. + */ +int ebAdd(ebuckets *eb, EbucketsType *type, eItem item, uint64_t expireTime) { + int res; + + assert(expireTime <= EB_EXPIRE_TIME_MAX); + + /* Set expire-time and reset segment flags */ + ExpireMeta *itemMeta = type->getExpireMeta(item); + ebSetMetaExpTime(itemMeta, expireTime); + itemMeta->lastInSegment = 0; + itemMeta->firstItemBucket = 0; + itemMeta->lastItemBucket = 0; + itemMeta->numItems = 0; + itemMeta->trash = 0; + + if (ebIsList(*eb) || (ebIsEmpty(*eb))) { + /* Try add item to list */ + if ( (res = ebAddToList(eb, type, item)) == 1) { + /* Failed to add since list reached maximum size. 
Convert to rax */ + *eb = ebConvertListToRax(ebGetListPtr(type, *eb), type); + res = ebAddToRax(eb, type, item, EB_BUCKET_KEY(expireTime)); + } + } else { + /* Add item to rax */ + res = ebAddToRax(eb, type, item, EB_BUCKET_KEY(expireTime)); + } + + EB_VALIDATE_STRUCTURE(*eb, type); + + return res; +} + +/** + * Performs expiration on the given ebucket, removing items that have expired. + * + * If all items in the data structure are expired, 'eb' will be set to NULL. + * + * @param eb - Pointer to the ebuckets handler, which may get updated + * @param type - Pointer to the EbucketsType structure defining the type of ebucket. + * @param info - Providing information about the expiration action. + */ +void ebExpire(ebuckets *eb, EbucketsType *type, ExpireInfo *info) { + /* updateList - maintain a list of expired items that the callback `onExpireItem` + * indicated to update their expiration time rather than removing them. + * At the end of this function, the items will be `ebAdd()` back. + * + * Note, this list of items does not allocate any memory, but temporary reuses + * the `next` pointer of the `ExpireMeta` structure of the expired items. */ + eItem updateList = NULL; + + /* reset info outputs */ + info->nextExpireTime = EB_EXPIRE_TIME_INVALID; + info->itemsExpired = 0; + + /* if empty ebuckets */ + if (ebIsEmpty(*eb)) return; + + if (ebIsList(*eb)) { + ebListExpire(eb, type, info, &updateList); + goto END_ACTEXP; + } + + /* handle rax expiry */ + + rax *rax = ebGetRaxPtr(*eb); + raxIterator iter; + + raxStart(&iter, rax); + + uint64_t nowKey = EB_BUCKET_KEY(info->now); + uint64_t itemsExpiredBefore = info->itemsExpired; + + while (1) { + raxSeek(&iter,"^",NULL,0); + if (!raxNext(&iter)) break; + + uint64_t bucketKey = raxKey2BucketKey(iter.key); + + FirstSegHdr *firstSegHdr = iter.data; + + /* We need to take into consideration EB_BUCKET_KEY_PRECISION. 
The value of
+         * "info->now" will be adjusted to lookup only for all buckets with assigned
+         * keys that are older than 1<<EB_BUCKET_KEY_PRECISION msec ago (See EB_BUCKET_KEY(info->now)). */
+        if (bucketKey >= nowKey) {
+            /* Take care to update next expire time based on next segment to expire */
+            info->nextExpireTime = ebGetMetaExpTime(
+                    type->getExpireMeta(firstSegHdr->head));
+            break;
+        }
+
+        /* If not managed to remove entire bucket then return */
+        if (ebSegExpire(firstSegHdr, type, info, &updateList) == 0)
+            break;
+
+        raxRemove(iter.rt, iter.key, EB_KEY_SIZE, NULL);
+    }
+
+    raxStop(&iter);
+    *ebRaxNumItems(rax) -= info->itemsExpired - itemsExpiredBefore;
+
+    if(raxEOF(&iter) && (updateList == 0)) {
+        raxFree(rax);
+        *eb = NULL;
+    }
+
+END_ACTEXP:
+    /* Add back items with updated expiration time */
+    while (updateList) {
+        ExpireMeta *mItem = type->getExpireMeta(updateList);
+        eItem next = mItem->next;
+        uint64_t expireAt = ebGetMetaExpTime(mItem);
+
+        /* Update next minimum expire time if needed.
+         * Condition is valid also if nextExpireTime is EB_EXPIRE_TIME_INVALID */
+        if (expireAt < info->nextExpireTime)
+            info->nextExpireTime = expireAt;
+
+        ebAdd(eb, type, updateList, expireAt);
+        updateList = next;
+    }
+
+    EB_VALIDATE_STRUCTURE(*eb, type);
+
+    return;
+}
+
+/* Performs active expiration dry-run to evaluate number of expired items
+ *
+ * It is faster than actual active-expire because it iterates only over the
+ * headers of the buckets until the first non-expired bucket, and no more than
+ * EB_SEG_MAX_ITEMS items in the last bucket
+ *
+ * @param eb - The ebucket to be checked.
+ * @param type - Pointer to the EbucketsType structure defining the type of ebucket.
+ * @param now - The current time in milliseconds.
+ */ +uint64_t ebExpireDryRun(ebuckets eb, EbucketsType *type, uint64_t now) { + if (ebIsEmpty(eb)) return 0; + + uint64_t numExpired = 0; + + /* If list, then iterate and count expired ones */ + if (ebIsList(eb)) { + ExpireMeta *mIter = type->getExpireMeta(ebGetListPtr(type, eb)); + while (1) { + if (ebGetMetaExpTime(mIter) >= now) + return numExpired; + + numExpired++; + + if (mIter->lastInSegment) + return numExpired; + + mIter = type->getExpireMeta(mIter->next); + } + } + + /* Handle rax active-expire */ + rax *rax = ebGetRaxPtr(eb); + raxIterator iter; + raxStart(&iter, rax); + uint64_t nowKey = EB_BUCKET_KEY(now); + raxSeek(&iter,"^",NULL,0); + assert(raxNext(&iter)); /* must be at least one bucket */ + FirstSegHdr *currBucket = iter.data; + + while (1) { + /* if 'currBucket' is last bucket, then break */ + if(!raxNext(&iter)) break; + FirstSegHdr *nextBucket = iter.data; + + /* if 'nextBucket' is not less than now then break */ + if (raxKey2BucketKey(iter.key) >= nowKey) break; + + /* nextBucket less than now. For sure all items in currBucket are expired */ + numExpired += currBucket->totalItems; + currBucket = nextBucket; + } + raxStop(&iter); + + /* If single segment bucket, iterate over items and count expired ones */ + if (currBucket->numSegs == 1) { + ExpireMeta *mIter = type->getExpireMeta(currBucket->head); + while (1) { + if (ebGetMetaExpTime(mIter) >= now) + return numExpired; + + numExpired++; + + if (mIter->lastInSegment) + return numExpired; + + mIter = type->getExpireMeta(mIter->next); + } + } + + /* Bucket key exactly reflect expiration time of all items (currBucket->numSegs > 1) */ + if (EB_BUCKET_KEY_PRECISION == 0) { + if (ebGetMetaExpTime(type->getExpireMeta(currBucket->head)) >= now) + return numExpired; + else + return numExpired + currBucket->totalItems; + } + + /* Iterate extended-segment and count expired ones */ + + /* Unreachable code, provided for completeness. 
Following operation is not
+     * bound in time and this is the main reason why we set above
+     * EB_BUCKET_KEY_PRECISION to 0 and have early return on previous condition */
+
+    ExpireMeta *mIter = type->getExpireMeta(currBucket->head);
+    while(1) {
+        if (ebGetMetaExpTime(mIter) < now)
+            numExpired++;
+
+        if (mIter->lastItemBucket)
+            return numExpired;
+
+        if (mIter->lastInSegment)
+            mIter = type->getExpireMeta(((NextSegHdr *) mIter->next)->head);
+        else
+            mIter = type->getExpireMeta(mIter->next);
+    }
+}
+
+/**
+ * Retrieves the expiration time of the item with the nearest expiration
+ *
+ * @param eb - The ebucket to be checked.
+ * @param type - Pointer to the EbucketsType structure defining the type of ebucket.
+ *
+ * @return The expiration time of the item with the nearest expiration time in
+ *         the ebucket. If empty, return EB_EXPIRE_TIME_INVALID. If ebuckets is
+ *         of type rax and minimal bucket is extended-segment, then it might not
+ *         return accurate result up-to 1<<EB_BUCKET_KEY_PRECISION msec.
+ */
+uint64_t ebGetNextTimeToExpire(ebuckets eb, EbucketsType *type) {
+    if (ebIsEmpty(eb))
+        return EB_EXPIRE_TIME_INVALID;
+
+    if (ebIsList(eb))
+        return ebGetMetaExpTime(type->getExpireMeta(ebGetListPtr(type, eb)));
+
+    /* rax */
+    uint64_t minExpire;
+    rax *rax = ebGetRaxPtr(eb);
+    raxIterator iter;
+    raxStart(&iter, rax);
+    raxSeek(&iter, "^", NULL, 0);
+    raxNext(&iter); /* seek to the first bucket */
+    FirstSegHdr *firstSegHdr = iter.data;
+    if ((firstSegHdr->numSegs == 1) || (EB_BUCKET_KEY_PRECISION == 0)) {
+        /* Single segment, or extended-segments that all have same expiration time.
+         * return the first item with the nearest expiration time */
+        minExpire = ebGetMetaExpTime(type->getExpireMeta(firstSegHdr->head));
+    } else {
+
+        /* If reached here, then it is because it is extended segment and buckets
+         * are with lower precision than 1msec. In that case it is better not to
+         * iterate extended-segments, which might be unbounded, and just return
+         * worst possible expiration time in this bucket.
+         *
+         * The reason we return blindly worst case expiration time value in this
+         * bucket is because the only usage of this function is to figure out
+         * when is the next time active expiration should be performed, and it
+         * is better to do it only after 1 or more items were expired and not the
+         * other way around.
+         */
+        uint64_t expTime = ebGetMetaExpTime(type->getExpireMeta(firstSegHdr->head));
+        minExpire = expTime | ( (1<<EB_BUCKET_KEY_PRECISION)-1 );
+    }
+    raxStop(&iter);
+    return minExpire;
+}
+
+/**
+ * Retrieves the maximum expiration time in the ebuckets
+ *
+ * @param eb - The ebucket to be checked.
+ * @param type - Pointer to the EbucketsType structure defining the type of ebucket.
+ * @param accurate - If 0, then result might be up-to 1<<EB_BUCKET_KEY_PRECISION
+ *        msec above the actual maximum (faster). Otherwise, iterate, if needed,
+ *        the last bucket to return accurate result.
+ *
+ * @return The maximum expiration time of the items in the ebucket. If empty,
+ *         return 0.
+ */
+uint64_t ebGetMaxExpireTime(ebuckets eb, EbucketsType *type, int accurate) {
+    if (ebIsEmpty(eb))
+        return 0;
+
+    if (ebIsList(eb)) {
+        eItem item = ebGetListPtr(type, eb);
+        ExpireMeta *em = type->getExpireMeta(item);
+        while (em->lastInSegment == 0)
+            em = type->getExpireMeta(em->next);
+        return ebGetMetaExpTime(em);
+    }
+
+    /* rax */
+    uint64_t maxExpire;
+    rax *rax = ebGetRaxPtr(eb);
+    raxIterator iter;
+    raxStart(&iter, rax);
+    raxSeek(&iter, "$", NULL, 0);
+    raxNext(&iter); /* seek to the last bucket */
+    FirstSegHdr *firstSegHdr = iter.data;
+    if (firstSegHdr->numSegs == 1) {
+        /* Single segment. return the last item with the highest expiration time */
+        ExpireMeta *em = type->getExpireMeta(firstSegHdr->head);
+        while (em->lastInSegment == 0)
+            em = type->getExpireMeta(em->next);
+        maxExpire = ebGetMetaExpTime(em);
+    } else if (EB_BUCKET_KEY_PRECISION == 0) {
+        /* Extended-segments that all have same expiration time */
+        maxExpire = ebGetMetaExpTime(type->getExpireMeta(firstSegHdr->head));
+    } else {
+        if (accurate == 0) {
+            /* return upper limit of the last bucket */
+            int mask = (1<<EB_BUCKET_KEY_PRECISION)-1;
+            uint64_t expTime = ebGetMetaExpTime(type->getExpireMeta(firstSegHdr->head));
+            maxExpire = (expTime + (mask+1)) & (~mask);
+        } else {
+            maxExpire = 0;
+            ExpireMeta *mIter = type->getExpireMeta(firstSegHdr->head);
+            while(1) {
+                while(1) {
+                    if (maxExpire < ebGetMetaExpTime(mIter))
+                        maxExpire = ebGetMetaExpTime(mIter);
+                    if (mIter->lastInSegment == 1) break;
+                    mIter = type->getExpireMeta(mIter->next);
+                }
+
+                if (mIter->lastItemBucket) break;
+                mIter = type->getExpireMeta(((NextSegHdr *) mIter->next)->head);
+            }
+        }
+    }
+    raxStop(&iter);
+    return maxExpire;
+}
+
+/**
+ * Retrieves the total number of items in the ebucket.
+ */ +uint64_t ebGetTotalItems(ebuckets eb, EbucketsType *type) { + if (ebIsEmpty(eb)) + return 0; + + if (ebIsList(eb)) + return type->getExpireMeta(ebGetListPtr(type, eb))->numItems; + else + return *ebRaxNumItems(ebGetRaxPtr(eb)); +} + +/* print expiration-time of items, ebuckets layout and some statistics */ +void ebPrint(ebuckets eb, EbucketsType *type) { + _ebPrint(eb, type, -1, 1); +} + +/* Validate the general structure of ebuckets. Calls assert(0) on error. */ +void ebValidate(ebuckets eb, EbucketsType *type) { + if (ebIsEmpty(eb)) + return; + + if (ebIsList(eb)) + ebValidateList(ebGetListPtr(type, eb), type); + else + ebValidateRax(ebGetRaxPtr(eb), type); +} + +/* Reallocates the memory used by the item using the provided allocation function. + * This feature was added for the active defrag feature. + * + * The 'defragfn' callbacks are called with a pointer to memory that callback + * can reallocate. The callbacks should return a new memory address or NULL, + * where NULL means that no reallocation happened and the old memory is still valid. + * + * Note: It is the caller's responsibility to ensure that the item has a valid expire time. */ +eItem ebDefragItem(ebuckets *eb, EbucketsType *type, eItem item, ebDefragFunction *defragfn) { + assert(!ebIsEmpty(*eb)); + if (ebIsList(*eb)) { + ExpireMeta *prevem = NULL; + eItem curitem = ebGetListPtr(type, *eb); + while (curitem != NULL) { + if (curitem == item) { + if ((curitem = defragfn(curitem))) { + if (prevem) + prevem->next = curitem; + else + *eb = ebMarkAsList(curitem); + } + return curitem; + } + + /* Move to the next item in the list. 
*/ + prevem = type->getExpireMeta(curitem); + curitem = prevem->next; + } + } else { + CommonSegHdr *currHdr; + ExpireMeta *mIter = type->getExpireMeta(item); + assert(mIter->trash != 1); + while (mIter->lastInSegment == 0) + mIter = type->getExpireMeta(mIter->next); + + if (mIter->lastItemBucket) + currHdr = (CommonSegHdr *) mIter->next; + else + currHdr = (CommonSegHdr *) ((NextSegHdr *) mIter->next)->prevSeg; + /* If the item is the first in the segment, then update the segment header */ + if (currHdr->head == item) { + if ((item = defragfn(item))) { + currHdr->head = item; + } + return item; + } + + /* Iterate over all items in the segment until the next is 'item' */ + ExpireMeta *mHead = type->getExpireMeta(currHdr->head); + mIter = mHead; + while (mIter->next != item) + mIter = type->getExpireMeta(mIter->next); + assert(mIter->next == item); + + if ((item = defragfn(item))) { + mIter->next = item; + } + return item; + } + redis_unreachable(); +} + +/* Retrieves the expiration time associated with the given item. If associated + * ExpireMeta is marked as trash, then return EB_EXPIRE_TIME_INVALID */ +uint64_t ebGetExpireTime(EbucketsType *type, eItem item) { + ExpireMeta *meta = type->getExpireMeta(item); + if (unlikely(meta->trash)) return EB_EXPIRE_TIME_INVALID; + return ebGetMetaExpTime(meta); +} + +/*** Unit tests ***/ + +#ifdef REDIS_TEST +#include +#include +#include +#include +#include "testhelp.h" + +#define TEST(name) printf("[TEST] >>> %s\n", name); +#define TEST_COND(name, cond) printf("[%s] >>> %s\n", (cond) ? 
"TEST" : "BYPS", name); if (cond) + +typedef struct MyItem { + int index; + ExpireMeta mexpire; +} MyItem; + +typedef struct TimeRange { + uint64_t start; + uint64_t end; +} TimeRange; + +ExpireMeta *getMyItemExpireMeta(const eItem item) { + return &((MyItem *)item)->mexpire; +} + +ExpireAction expireItemCb(void *item, eItem ctx); +void deleteItemCb(eItem item, void *ctx); +EbucketsType myEbucketsType = { + .getExpireMeta = getMyItemExpireMeta, + .onDeleteItem = deleteItemCb, + .itemsAddrAreOdd = 0, +}; + +EbucketsType myEbucketsType2 = { + .getExpireMeta = getMyItemExpireMeta, + .onDeleteItem = NULL, + .itemsAddrAreOdd = 0, +}; + +/* XOR over all items time-expiration. Must be 0 after all addition/removal */ +uint64_t expItemsHashValue = 0; + +ExpireAction expireItemCb(eItem item, void *ctx) { + ExpireMeta *meta = myEbucketsType.getExpireMeta(item); + uint64_t expTime = ebGetMetaExpTime(meta); + expItemsHashValue = expItemsHashValue ^ expTime; + + TimeRange *range = (TimeRange *) ctx; + /* Verify expiration time is within the range */ + if (range != NULL) assert(expTime >= range->start && expTime <= range->end); + +/* If benchmarking then avoid from heavyweight free operation. 
It is user side logic */ +#ifndef EB_TEST_BENCHMARK + zfree(item); +#endif + return ACT_REMOVE_EXP_ITEM; +} + +ExpireAction expireUpdateThirdItemCb(eItem item, void *ctx) { + uint64_t expTime = (uint64_t) (uintptr_t) ctx; + static int calls = 0; + if ((calls++) == 3) { + ebSetMetaExpTime(&(((MyItem *)item)->mexpire), expTime ); + return ACT_UPDATE_EXP_ITEM; + } + + return ACT_REMOVE_EXP_ITEM; +} + +void deleteItemCb(eItem item, void *ctx) { + UNUSED(ctx); + zfree(item); +} + +void addItems(ebuckets *eb, uint64_t startExpire, int step, uint64_t numItems, MyItem **ar) { + for (uint64_t i = 0 ; i < numItems ; i++) { + uint64_t expireTime = startExpire + (i * step); + expItemsHashValue = expItemsHashValue ^ expireTime; + MyItem *item = zmalloc(sizeof(MyItem)); + if (ar) ar[i] = item; + ebAdd(eb, &myEbucketsType, item, expireTime); + } +} + +/* expireRanges - is given as bucket-key to be agnostic to the different configuration + * of EB_BUCKET_KEY_PRECISION */ +void distributeTest(int lowestTime, + uint64_t *expireRanges, + const int *ItemsPerRange, + int numRanges, + int isExpire, + int printStat) { + struct timeval timeBefore, timeAfter, timeDryRun, timeCreation, timeDestroy; + ebuckets eb = ebCreate(); + + /* create items with random expiry */ + uint64_t startRange = lowestTime; + + expItemsHashValue = 0; + void *listOfItems = NULL; + for (int i = 0; i < numRanges; i++) { + uint64_t endRange = EB_BUCKET_EXP_TIME(expireRanges[i]); + for (int j = 0; j < ItemsPerRange[i]; j++) { + uint64_t randomExpirey = (rand() % (endRange - startRange)) + startRange; + expItemsHashValue = expItemsHashValue ^ (uint32_t) randomExpirey; + MyItem *item = zmalloc(sizeof(MyItem)); + getMyItemExpireMeta(item)->next = listOfItems; + listOfItems = item; + ebSetMetaExpTime(getMyItemExpireMeta(item), randomExpirey); + } + startRange = EB_BUCKET_EXP_TIME(expireRanges[i]); /* next start range */ + } + + /* Take to sample memory after all items allocated and before insertion to ebuckets */ + 
size_t usedMemBefore = zmalloc_used_memory(); + + gettimeofday(&timeBefore, NULL); + while (listOfItems) { + MyItem *item = (MyItem *)listOfItems; + listOfItems = getMyItemExpireMeta(item)->next; + uint64_t expireTime = ebGetMetaExpTime(&item->mexpire); + ebAdd(&eb, &myEbucketsType, item, expireTime); + } + gettimeofday(&timeAfter, NULL); + timersub(&timeAfter, &timeBefore, &timeCreation); + + gettimeofday(&timeBefore, NULL); + ebExpireDryRun(eb, &myEbucketsType, 0xFFFFFFFFFFFF); /* expire dry-run all */ + gettimeofday(&timeAfter, NULL); + timersub(&timeAfter, &timeBefore, &timeDryRun); + + if (printStat) { + _ebPrint(eb, &myEbucketsType, zmalloc_used_memory() - usedMemBefore, 0); + } + + gettimeofday(&timeBefore, NULL); + if (isExpire) { + startRange = lowestTime; + /* Active expire according to the ranges */ + for (int i = 0 ; i < numRanges ; i++) { + + /* When checking how many items are expired, we need to take into + * consideration EB_BUCKET_KEY_PRECISION. The value of "info->now" + * will be adjusted by ebActiveExpire() to lookup only for all buckets + * with assigned keys that are older than 1<now) and not "<=". + * But if there is a list behind ebuckets, then this limitation is not + * applied and the operator "<=" will be used instead. + * + * The '-1' in case of list brings makes both cases aligned to have + * same result */ + uint64_t now = EB_BUCKET_EXP_TIME(expireRanges[i]) + (ebIsList(eb) ? 
-1 : 0); + + TimeRange range = {EB_BUCKET_EXP_TIME(startRange), EB_BUCKET_EXP_TIME(expireRanges[i]) }; + ExpireInfo info = { + .maxToExpire = 0xFFFFFFFF, + .onExpireItem = expireItemCb, + .ctx = &range, + .now = now, + .itemsExpired = 0}; + + ebExpire(&eb, &myEbucketsType, &info); + + assert( (eb==NULL && (i + 1 == numRanges)) || (eb!=NULL && (i + 1 < numRanges)) ); + assert( info.itemsExpired == (uint64_t) ItemsPerRange[i]); + startRange = expireRanges[i]; + } + assert(eb == NULL); + assert( (expItemsHashValue & 0xFFFFFFFF) == 0); + } + ebDestroy(&eb, &myEbucketsType, NULL); + gettimeofday(&timeAfter, NULL); + timersub(&timeAfter, &timeBefore, &timeDestroy); + + if (printStat) { + printf("Time elapsed ebuckets creation : %ld.%06ld\n", (long int)timeCreation.tv_sec, (long int)timeCreation.tv_usec); + printf("Time elapsed active-expire dry-run : %ld.%06ld\n", (long int)timeDryRun.tv_sec, (long int)timeDryRun.tv_usec); + if (isExpire) + printf("Time elapsed active-expire : %ld.%06ld\n", (long int)timeDestroy.tv_sec, (long int)timeDestroy.tv_usec); + else + printf("Time elapsed destroy : %ld.%06ld\n", (long int)timeDestroy.tv_sec, (long int)timeDestroy.tv_usec); + } + +} + +#define UNUSED(x) (void)(x) +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) + +eItem defragCallback(const eItem item) { + size_t size = zmalloc_usable_size(item); + eItem newitem = zmalloc(size); + memcpy(newitem, item, size); + zfree(item); + return newitem; +} + +int ebucketsTest(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + srand(0); + + int verbose = (flags & REDIS_TEST_VERBOSE) ? 
2 : 1; + UNUSED(verbose); + +#ifdef EB_TEST_BENCHMARK + TEST("ebuckets - benchmark 10 million items: alloc + add + activeExpire") { + + struct TestParams { + uint64_t minExpire; + uint64_t maxExpire; + int items; + const char *description; + } testCases[] = { + { 1805092100000, 1805092100000 + (uint64_t) 1, 10000000, "1 msec distribution" }, + { 1805092100000, 1805092100000 + (uint64_t) 1000, 10000000, "1 sec distribution" }, + { 1805092100000, 1805092100000 + (uint64_t) 1000*60, 10000000, "1 min distribution" }, + { 1805092100000, 1805092100000 + (uint64_t) 1000*60*60, 10000000, "1 hour distribution" }, + { 1805092100000, 1805092100000 + (uint64_t) 1000*60*60*24, 10000000, "1 day distribution" }, + { 1805092100000, 1805092100000 + (uint64_t) 1000*60*60*24*7, 10000000, "1 week distribution" }, + { 1805092100000, 1805092100000 + (uint64_t) 1000*60*60*24*30, 10000000, "1 month distribution" } + }; + + /* selected test */ + uint32_t tid = EB_TEST_BENCHMARK; + + printf("\n------ TEST EBUCKETS: %s ------\n", testCases[tid].description); + uint64_t expireRanges[] = { testCases[tid].minExpire, testCases[tid].maxExpire }; + int itemsPerRange[] = { 0, testCases[tid].items }; + + /* expireRanges[] is provided to distributeTest() as bucket-key values */ + for (uint32_t j = 0; j < ARRAY_SIZE(expireRanges); ++j) { + expireRanges[j] = expireRanges[j] >> EB_BUCKET_KEY_PRECISION; + } + + distributeTest(0, expireRanges, itemsPerRange, ARRAY_SIZE(expireRanges), 1, 1); + return 0; + } +#endif + + TEST("list - Create a single item, get TTL, and remove") { + MyItem *singleItem = zmalloc(sizeof(MyItem)); + ebuckets eb = NULL; + ebAdd(&eb, &myEbucketsType, singleItem, 1000); + assert(ebGetExpireTime(&myEbucketsType, singleItem) == 1000 ); + + /* remove the item */ + assert(ebRemove(&eb, &myEbucketsType, singleItem)); + /* now the ebuckets is empty */ + assert(ebRemove(&eb, &myEbucketsType, singleItem) == 0); + + zfree(singleItem); + + ebDestroy(&eb, &myEbucketsType, NULL); + } + + 
TEST("list - Create few items on different times, get TTL, and then remove") { + MyItem *items[EB_LIST_MAX_ITEMS]; + ebuckets eb = NULL; + for (int i = 0 ; i < EB_LIST_MAX_ITEMS ; i++) { + items[i] = zmalloc(sizeof(MyItem)); + ebAdd(&eb, &myEbucketsType, items[i], i); + } + + for (uint64_t i = 0 ; i < EB_LIST_MAX_ITEMS ; i++) { + assert(ebGetExpireTime(&myEbucketsType, items[i]) == i ); + assert(ebRemove(&eb, &myEbucketsType, items[i])); + } + + for (int i = 0 ; i < EB_LIST_MAX_ITEMS ; i++) { + zfree(items[i]); + } + + ebDestroy(&eb, &myEbucketsType, NULL); + } + + TEST("list - Create few items on different times, get TTL, and then delete") { + MyItem *items[EB_LIST_MAX_ITEMS]; + ebuckets eb = NULL; + for (int i = 0 ; i < EB_LIST_MAX_ITEMS ; i++) { + items[i] = zmalloc(sizeof(MyItem)); + ebAdd(&eb, &myEbucketsType, items[i], i); + } + + for (uint64_t i = 0 ; i < EB_LIST_MAX_ITEMS ; i++) { + assert(ebGetExpireTime(&myEbucketsType, items[i]) == i ); + } + + ebDestroy(&eb, &myEbucketsType, NULL); + } + + TEST_COND("ebuckets - Add items with increased/decreased expiration time and then expire", + EB_BUCKET_KEY_PRECISION > 0) + { + ebuckets eb = NULL; + + for (int isDecr = 0; isDecr < 2; ++isDecr) { + for (uint32_t numItems = 1; numItems < 64; ++numItems) { + uint64_t step = 1 << EB_BUCKET_KEY_PRECISION; + + if (isDecr == 0) + addItems(&eb, 0, step, numItems, NULL); + else + addItems(&eb, (numItems - 1) * step, -step, numItems, NULL); + + for (uint32_t i = 1; i <= numItems; i++) { + TimeRange range = {EB_BUCKET_EXP_TIME(i - 1), EB_BUCKET_EXP_TIME(i)}; + ExpireInfo info = { + .maxToExpire = 1, + .onExpireItem = expireItemCb, + .ctx = &range, + .now = EB_BUCKET_EXP_TIME(i), + .itemsExpired = 0}; + + ebExpire(&eb, &myEbucketsType, &info); + assert(info.itemsExpired == 1); + if (i == numItems) { /* if last item */ + assert(eb == NULL); + assert(info.nextExpireTime == EB_EXPIRE_TIME_INVALID); + } else { + assert(info.nextExpireTime == EB_BUCKET_EXP_TIME(i)); + } + } + } + } 
+ } + + TEST_COND("ebuckets - Create items with same expiration time and then expire", + EB_BUCKET_KEY_PRECISION > 0) + { + ebuckets eb = NULL; + uint64_t expirePerIter = 2; + for (uint32_t numIterations = 1; numIterations < 100; ++numIterations) { + uint32_t numItems = numIterations * expirePerIter; + uint64_t expireTime = (1 << EB_BUCKET_KEY_PRECISION) + 1; + addItems(&eb, expireTime, 0, numItems, NULL); + + for (uint32_t i = 1; i <= numIterations; i++) { + ExpireInfo info = { + .maxToExpire = expirePerIter, + .onExpireItem = expireItemCb, + .ctx = NULL, + .now = (2 << EB_BUCKET_KEY_PRECISION), + .itemsExpired = 0}; + ebExpire(&eb, &myEbucketsType, &info); + assert(info.itemsExpired == expirePerIter); + if (i == numIterations) { /* if last item */ + assert(eb == NULL); + assert(info.nextExpireTime == EB_EXPIRE_TIME_INVALID); + } else { + assert(info.nextExpireTime == expireTime); + } + } + } + } + + TEST("list - Create few items on random times and then expire/delete ") { + for (int isExpire = 0 ; isExpire <= 1 ; ++isExpire ) { + uint64_t expireRanges[] = {1000}; /* bucket-keys */ + int itemsPerRange[] = {EB_LIST_MAX_ITEMS}; + distributeTest(0, expireRanges, itemsPerRange, + ARRAY_SIZE(expireRanges), isExpire, 0); + } + } + + TEST("list - Create few items (list) on same time and then active expire/delete ") { + for (int isExpire = 0 ; isExpire <= 1 ; ++isExpire ) { + uint64_t expireRanges[] = {1, 2}; /* bucket-keys */ + int itemsPerRange[] = {0, EB_LIST_MAX_ITEMS}; + + distributeTest(0, expireRanges, itemsPerRange, + ARRAY_SIZE(expireRanges), isExpire, 0); + } + } + + TEST("ebuckets - Create many items on same time and then active expire/delete ") { + for (int isExpire = 1 ; isExpire <= 1 ; ++isExpire ) { + uint64_t expireRanges[] = {1, 2}; /* bucket-keys */ + int itemsPerRange[] = {0, 20}; + + distributeTest(0, expireRanges, itemsPerRange, + ARRAY_SIZE(expireRanges), isExpire, 0); + } + } + + TEST("ebuckets - Create items on different times and then 
expire/delete ") { + for (int isExpire = 0 ; isExpire <= 0 ; ++isExpire ) { + for (int numItems = 1 ; numItems < 100 ; ++numItems ) { + uint64_t expireRanges[] = {1000000}; /* bucket-keys */ + int itemsPerRange[] = {numItems}; + distributeTest(0, expireRanges, itemsPerRange, + ARRAY_SIZE(expireRanges), 1, 0); + } + } + } + + TEST("ebuckets - Create items on different times and then ebRemove() ") { + ebuckets eb = NULL; + + for (int step = -1 ; step <= 1 ; ++step) { + for (int numItems = 1; numItems <= EB_SEG_MAX_ITEMS*3; ++numItems) { + for (int offset = 0; offset < numItems; offset++) { + MyItem *items[numItems]; + uint64_t startValue = 1000 << EB_BUCKET_KEY_PRECISION; + int stepValue = step * (1 << EB_BUCKET_KEY_PRECISION); + addItems(&eb, startValue, stepValue, numItems, items); + for (int i = 0; i < numItems; i++) { + int at = (i + offset) % numItems; + assert(ebRemove(&eb, &myEbucketsType, items[at])); + zfree(items[at]); + } + assert(eb == NULL); + } + } + } + } + + TEST("ebuckets - test min/max expire time") { + ebuckets eb = NULL; + MyItem items[3*EB_SEG_MAX_ITEMS]; + for (int numItems = 1 ; numItems < (int)ARRAY_SIZE(items) ; numItems++) { + uint64_t minExpTime = RAND_MAX, maxExpTime = 0; + for (int i = 0; i < numItems; i++) { + /* generate random expiration time */ + uint64_t expireTime = rand(); + if (expireTime < minExpTime) minExpTime = expireTime; + if (expireTime > maxExpTime) maxExpTime = expireTime; + ebAdd(&eb, &myEbucketsType2, items + i, expireTime); + assert(ebGetNextTimeToExpire(eb, &myEbucketsType2) == minExpTime); + assert(ebGetMaxExpireTime(eb, &myEbucketsType2, 0) == maxExpTime); + } + ebDestroy(&eb, &myEbucketsType2, NULL); + } + } + + TEST_COND("ebuckets - test min/max expire time, with extended-segment", + (1< 2*EB_SEG_MAX_ITEMS) { + ebuckets eb = NULL; + MyItem items[(2*EB_SEG_MAX_ITEMS)-1]; + for (int numItems = EB_SEG_MAX_ITEMS+1 ; numItems < (int)ARRAY_SIZE(items) ; numItems++) { + /* First reach extended-segment (two chained 
segments in a bucket) */ + for (int i = 0; i <= EB_SEG_MAX_ITEMS; i++) { + uint64_t itemExpireTime = (1<index = i; + ebAdd(&eb, &myEbucketsType, items[i], i); + } + assert((s <= EB_LIST_MAX_ITEMS) ? ebIsList(eb) : !ebIsList(eb)); + /* Defrag all the items. */ + for (int i = 0; i < s; i++) { + MyItem *newitem = ebDefragItem(&eb, &myEbucketsType, items[i], defragCallback); + if (newitem) items[i] = newitem; + } + /* Verify that the data is not corrupted. */ + ebValidate(eb, &myEbucketsType); + for (int i = 0; i < s; i++) + assert(items[i]->index == i); + ebDestroy(&eb, &myEbucketsType, NULL); + } + } + +// TEST("segment - Add smaller item to full segment that all share same ebucket-key") +// TEST("segment - Add item to full segment and make it extended-segment (all share same ebucket-key)") +// TEST("ebuckets - Create rax tree with extended-segment and add item before") + + return 0; +} + +#endif diff --git a/src/ebuckets.h b/src/ebuckets.h new file mode 100644 index 00000000000..fbcae8fd153 --- /dev/null +++ b/src/ebuckets.h @@ -0,0 +1,306 @@ +/* + * Copyright Redis Ltd. 2024 - present + * + * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) + * or the Server Side Public License v1 (SSPLv1). + * + * + * WHAT IS EBUCKETS? + * ----------------- + * ebuckets is being used to store items that are set with expiration-time. It + * supports the basic API of add, remove and active expiration. The implementation + * of it is based on rax-tree, or plain linked-list when small. The expiration time + * of the items are used as the key to traverse rax-tree. + * + * Instead of holding a distinct item in each leaf of the rax-tree we can aggregate + * items into small segments and hold it in each leaf. This way we can avoid + * frequent modification of the rax-tree, since many of the modifications + * will be done only at the segment level. 
It will also save memory because + * rax-tree can be costly, around 40 bytes per leaf (with rax-key limited to 6 + * bytes). Whereas each additional item in the segment will cost the size of the + * 'next' pointer in a list (8 bytes) and few more bytes for maintenance of the + * segment. + * + * EBUCKETS STRUCTURE + * ------------------ + * The ebuckets data structure is organized in a hierarchical manner as follows: + * + * 1. ebuckets: This is the top-level data structure. It can be either a rax tree + * or a plain linked list. It contains one or more buckets, each representing + * an interval in time. + * + * 2. bucket: Each bucket represents an interval in time and contains one or more + * segments. The key in the rax-tree for each bucket represents low + * bound expiration-time for the items within this bucket. The key of the + * following bucket represents the upper bound expiration-time. + * + * 3. segment: Each segment within a bucket can hold up to `EB_SEG_MAX_ITEMS` + * items as a linked list. If there are more, the segment will try to + * split the bucket. To avoid wasting memory, it is a singly linked list (only + * next-item pointer). It is a cyclic linked-list to allow efficient removal of + * items from the middle of the segment without traversing the rax tree. + * + * 4. item: Each item that is stored in ebuckets should embed the ExpireMeta + * struct and supply getter function (see EbucketsType.getExpireMeta). This + * struct holds the expire-time of the item and few more fields that are used + * to maintain the segments data-structure. + * + * SPLITTING BUCKET + * ---------------- + * Each segment can hold up-to `EB_SEG_MAX_ITEMS` items. On insertion of new + * item, it will try to split the segment. 
Here is an example For adding item + * with expiration of 42 to a segment that already reached its maximum capacity + * which will cause to split of the segment and in turn split of the bucket as + * well to a finer grained ranges: + * + * BUCKETS BUCKETS + * [ 00-10 ] -> size(Seg0) = 11 ==> [ 00-10 ] -> size(Seg0) = 11 + * [ 11-76 ] -> size(Seg1) = 16 [ 11-36 ] -> size(Seg1) = 9 + * [ 37-76 ] -> size(Seg2) = 7 + * + * EXTENDING BUCKET + * ---------------- + * In the example above, the reason it wasn't split evenly is that Seg1 must have + * been holding items with same TTL and they must reside together in the same + * bucket after the split. Which brings us to another important point. If there + * is a segment that reached its maximum capacity and all the items have same + * expiration-time key, then we cannot split the bucket but aggregate all the + * items, with same expiration time key, by allocating an extended-segment and + * chain it to the first segment in visited bucket. In that sense, extended + * segments will only hold items with same expiration-time key. + * + * BUCKETS BUCKETS + * [ 00-10 ] -> size(Seg0)=11 ==> [ 00-10 ] -> size(Seg0)=11 + * [ 11-12 ] -> size(Seg1)=16 [ 11-12 ] -> size(Seg1)=1 -> size(Seg2)=16 + * + * LIMITING RAX TREE DEPTH + * ----------------------- + * The rax tree is basically a B-tree and its depth is bounded by the sizeof of + * the key. Holding 6 bytes for expiration-time key is more than enough to represent + * unix-time in msec, and in turn the depth of the tree is limited to 6 levels. + * At a first glance it might look sufficient but we need take into consideration + * the heavyweight maintenance and traversal of each node in the B-tree. + * + * And so, we can further prune the tree such that holding keys with msec precision + * in the tree doesn't bring with it much value. 
The active-expiration operation can + * live with deletion of expired items, say, older than 1 sec, which means the size + * of time-expiration keys to the rax tree become no more than ~4.5 bytes and we + * also get rid of the "noisy" bits which most probably will cause to yet another + * branching and modification of the rax tree in case of items with time-expiration + * difference of less than 1 second. The lazy expiration will still be precise and + * without compromise on accuracy because the exact expiration-time is kept + * attached as well to each item, in `ExpireMeta`, and each traversal of item with + * expiration will behave as expected down to the msec. Take care to configure + * `EB_BUCKET_KEY_PRECISION` according to your needs. + * + * EBUCKET KEY + * ----------- + * Taking into account configured value of `EB_BUCKET_KEY_PRECISION`, two items + * with expiration-time t1 and t2 will be considered to have the same key in the + * rax-tree/buckets if and only if: + * + * EB_BUCKET_KEY(t1) == EB_BUCKET_KEY(t2) + * + * EBUCKETS CREATION + * ----------------- + * To avoid the cost of allocating rax data-structure for only few elements, + * ebuckets will start as a simple linked-list and only when it reaches some + * threshold, it will be converted to rax. + * + * TODO + * ---- + * - ebRemove() optimize to merge small segments into one segment. + * - ebAdd() Fix pathological case of cascade addition of items into rax such + * that their values are smaller/bigger than visited extended-segment which ends + * up with multiple segments with a single item in each segment. + */ + +#ifndef __EBUCKETS_H +#define __EBUCKETS_H + +#include +#include +#include +#include +#include "rax.h" + +/* + * EB_BUCKET_KEY_PRECISION - Defines the number of bits to ignore from the + * expiration-time when mapping to buckets. The higher the value, the more items + * with similar expiration-time will be aggregated into the same bucket. 
The lower + * the value, the more "accurate" the active expiration of buckets will be. + * + * Note that the accurate time expiration of each item is preserved anyway and + * enforced by lazy expiration. It only impacts the active expiration that will + * be able to work on buckets older than (1<> EB_BUCKET_KEY_PRECISION) + + +#define EB_EXPIRE_TIME_MAX ((uint64_t)0x0000FFFFFFFFFFFF) /* Maximum expire-time. */ +#define EB_EXPIRE_TIME_INVALID (EB_EXPIRE_TIME_MAX+1) /* assumed bigger than max */ + +/* Handler to ebuckets DS. Pointer to a list, rax or NULL (empty DS). See also ebIsList(). */ +typedef void *ebuckets; + +/* Users of ebuckets will store `eItem` which is just a void pointer to their + * element. In addition, eItem should embed the ExpireMeta struct and supply + * getter function (see EbucketsType.getExpireMeta). + */ +typedef void *eItem; + +/* This struct Should be embedded inside `eItem` and must be aligned in memory. */ +typedef struct ExpireMeta { + /* 48bits of unix-time in msec. This value is sufficient to represent, in + * unix-time, until the date of 02 August, 10889 + */ + uint32_t expireTimeLo; /* Low bits of expireTime. */ + uint16_t expireTimeHi; /* High bits of expireTime. */ + + unsigned int lastInSegment : 1; /* Last item in segment. If set, then 'next' will + point to the NextSegHdr, unless lastItemBucket=1 + then it will point to segment header of the + current segment. */ + unsigned int firstItemBucket : 1; /* First item in bucket. This flag assist + to manipulate segments directly without + the need to traverse from start the + rax tree */ + unsigned int lastItemBucket : 1; /* Last item in bucket. This flag assist + to manipulate segments directly without + the need to traverse from start the + rax tree */ + unsigned int numItems : 5; /* Only first item in segment will maintain + this value. */ + + unsigned int trash : 1; /* This flag indicates whether the ExpireMeta + associated with the item is leftover. 
+ There is always a potential to reuse the + item after removal/deletion. Note that, + the user can still safely O(1) TTL lookup + a given item and verify whether attached + TTL is valid or leftover. See function + ebGetExpireTime(). */ + + unsigned int userData : 3; /* ebuckets can be used to store in same + instance few different types of items, + such as, listpack and hash. This field + is reserved to store such identification + associated with the item and can help + to distinct on delete or expire callback. + It is not used by ebuckets internally and + should be maintained by the user */ + + unsigned int reserved : 4; + + void *next; /* - If not last item in segment then next + points to next eItem (lastInSegment=0). + - If last in segment but not last in + bucket (lastItemBucket=0) then it + points to next segment header. + - If last in bucket then it points to + current segment header (Can be either + of type FirstSegHdr or NextSegHdr). */ +} ExpireMeta; + +/* Each instance of ebuckets need to have corresponding EbucketsType that holds + * the necessary callbacks and configuration to operate correctly on the type + * of items that are stored in it. Conceptually it should have hold reference + * from ebuckets instance to this type, but to save memory we will pass it as + * an argument to each API call. */ +typedef struct EbucketsType { + /* getter to extract the ExpireMeta from the item */ + ExpireMeta* (*getExpireMeta)(const eItem item); + + /* Called during ebDestroy(). Set to NULL if not needed. */ + void (*onDeleteItem)(eItem item, void *ctx); + + /* Is addresses of items are odd in memory. It is taken into consideration + * and used by ebuckets to know how to distinct between ebuckets pointer to + * rax versus a pointer to item which is head of list. */ + unsigned int itemsAddrAreOdd; +} EbucketsType; + +/* Returned value by `onExpireItem` callback to indicate the action to be taken by + * ebExpire(). 
*/ +typedef enum ExpireAction { + ACT_REMOVE_EXP_ITEM=0, /* Remove the item from ebuckets. */ + ACT_UPDATE_EXP_ITEM, /* Re-insert the item with updated expiration-time. + Before returning this value, the cb need to + update expiration time of the item by assisting + function ebSetMetaExpTime(). The item will be + kept aside and will be added again to ebuckets + at the end of ebExpire() */ + ACT_STOP_ACTIVE_EXP /* Stop active-expiration. It will assume that + provided 'item' wasn't deleted by the callback. */ +} ExpireAction; + +/* ExpireInfo is used to pass input and output parameters to ebExpire(). */ +typedef struct ExpireInfo { + /* onExpireItem - Called during active-expiration by ebExpire() */ + ExpireAction (*onExpireItem)(eItem item, void *ctx); + + uint64_t maxToExpire; /* [INPUT ] Limit of number expired items to scan */ + void *ctx; /* [INPUT ] context to pass to onExpireItem */ + uint64_t now; /* [INPUT ] Current time in msec. */ + uint64_t itemsExpired; /* [OUTPUT] Returns the number of expired or updated items. */ + uint64_t nextExpireTime; /* [OUTPUT] Next expiration time. Returns + EB_EXPIRE_TIME_INVALID if none left. 
*/ +} ExpireInfo; + +/* ebuckets API */ + +static inline ebuckets ebCreate(void) { return NULL; } /* Empty ebuckets */ + +void ebDestroy(ebuckets *eb, EbucketsType *type, void *deletedItemsCbCtx); + +void ebExpire(ebuckets *eb, EbucketsType *type, ExpireInfo *info); + +uint64_t ebExpireDryRun(ebuckets eb, EbucketsType *type, uint64_t now); + +static inline int ebIsEmpty(ebuckets eb) { return eb == NULL; } + +uint64_t ebGetNextTimeToExpire(ebuckets eb, EbucketsType *type); + +uint64_t ebGetMaxExpireTime(ebuckets eb, EbucketsType *type, int accurate); + +uint64_t ebGetTotalItems(ebuckets eb, EbucketsType *type); + +/* Item related API */ + +int ebRemove(ebuckets *eb, EbucketsType *type, eItem item); + +int ebAdd(ebuckets *eb, EbucketsType *type, eItem item, uint64_t expireTime); + +uint64_t ebGetExpireTime(EbucketsType *type, eItem item); + +typedef eItem (ebDefragFunction)(const eItem item); +eItem ebDefragItem(ebuckets *eb, EbucketsType *type, eItem item, ebDefragFunction *fn); + +static inline uint64_t ebGetMetaExpTime(ExpireMeta *expMeta) { + return (((uint64_t)(expMeta)->expireTimeHi << 32) | (expMeta)->expireTimeLo); +} + +static inline void ebSetMetaExpTime(ExpireMeta *expMeta, uint64_t t) { + expMeta->expireTimeLo = (uint32_t)(t&0xFFFFFFFF); + expMeta->expireTimeHi = (uint16_t)((t) >> 32); +} + +/* Debug API */ + +void ebValidate(ebuckets eb, EbucketsType *type); + +void ebPrint(ebuckets eb, EbucketsType *type); + +#ifdef REDIS_TEST +int ebucketsTest(int argc, char *argv[], int flags); +#endif + +#endif /* __EBUCKETS_H */ diff --git a/src/endianconv.c b/src/endianconv.c index 8eb6b22288d..36673e0d015 100644 --- a/src/endianconv.c +++ b/src/endianconv.c @@ -13,32 +13,11 @@ * * ---------------------------------------------------------------------------- * - * Copyright (c) 2011-2012, Salvatore Sanfilippo + * Copyright (c) 2011-Present, Redis Ltd. * All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ diff --git a/src/endianconv.h b/src/endianconv.h index bfe9b7d0acd..469913d1165 100644 --- a/src/endianconv.h +++ b/src/endianconv.h @@ -2,32 +2,11 @@ * * ---------------------------------------------------------------------------- * - * Copyright (c) 2011-2012, Salvatore Sanfilippo + * Copyright (c) 2011-Present, Redis Ltd. * All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef __ENDIANCONV_H diff --git a/src/eval.c b/src/eval.c index eb4b529368d..1cea9e6db02 100644 --- a/src/eval.c +++ b/src/eval.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2011-Present, Redis Ltd. * All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #include "server.h" @@ -38,6 +17,9 @@ #include #include #include +#if defined(USE_JEMALLOC) +#include +#endif #include #include @@ -46,6 +28,7 @@ void ldbDisable(client *c); void ldbEnable(client *c); void evalGenericCommandWithDebugging(client *c, int evalsha); sds ldbCatStackValue(sds s, lua_State *lua, int idx); +listNode *luaScriptsLRUAdd(client *c, sds sha, int evalsha); static void dictLuaScriptDestructor(dict *d, void *val) { UNUSED(d); @@ -58,7 +41,7 @@ static uint64_t dictStrCaseHash(const void *key) { return dictGenCaseHashFunction((unsigned char*)key, strlen((char*)key)); } -/* server.lua_scripts sha (as sds string) -> scripts (as luaScript) cache. */ +/* lctx.lua_scripts sha (as sds string) -> scripts (as luaScript) cache. */ dictType shaScriptObjectDictType = { dictStrCaseHash, /* hash function */ NULL, /* key dup */ @@ -74,6 +57,7 @@ struct luaCtx { lua_State *lua; /* The Lua interpreter. We use just one for all clients */ client *lua_client; /* The "fake client" to query Redis from Lua */ dict *lua_scripts; /* A dictionary of SHA1 -> Lua scripts */ + list *lua_scripts_lru_list; /* A list of SHA1, first in first out LRU eviction. */ unsigned long long lua_scripts_mem; /* Cached scripts' memory + oh */ } lctx; @@ -181,18 +165,23 @@ int luaRedisReplicateCommandsCommand(lua_State *lua) { * * However it is simpler to just call scriptingReset() that does just that. */ void scriptingInit(int setup) { - lua_State *lua = lua_open(); - if (setup) { lctx.lua_client = NULL; server.script_disable_deny_script = 0; ldbInit(); } + lua_State *lua = createLuaState(); + if (lua == NULL) { + serverLog(LL_WARNING, "Failed creating the lua VM."); + exit(1); + } + /* Initialize a dictionary we use to map SHAs to scripts. - * This is useful for replication, as we need to replicate EVALSHA - * as EVAL, so we need to remember the associated script. */ + * Initialize a list we use for lua script evictions, it shares the + * sha with the dictionary, so free fn is not set. 
*/ lctx.lua_scripts = dictCreate(&shaScriptObjectDictType); + lctx.lua_scripts_lru_list = listCreate(); lctx.lua_scripts_mem = 0; luaRegisterRedisAPI(lua); @@ -264,15 +253,27 @@ void scriptingInit(int setup) { lctx.lua = lua; } +/* Free lua_scripts dict and close lua interpreter. */ +void freeLuaScriptsSync(dict *lua_scripts, list *lua_scripts_lru_list, lua_State *lua) { + dictRelease(lua_scripts); + listRelease(lua_scripts_lru_list); + lua_close(lua); + +#if defined(USE_JEMALLOC) + /* When lua is closed, destroy the previously used private tcache. */ + void *ud = (global_State*)G(lua)->ud; + unsigned int lua_tcache = (unsigned int)(uintptr_t)ud; + je_mallctl("tcache.destroy", NULL, NULL, (void *)&lua_tcache, sizeof(unsigned int)); +#endif +} + /* Release resources related to Lua scripting. * This function is used in order to reset the scripting environment. */ void scriptingRelease(int async) { if (async) - freeLuaScriptsAsync(lctx.lua_scripts); + freeLuaScriptsAsync(lctx.lua_scripts, lctx.lua_scripts_lru_list, lctx.lua); else - dictRelease(lctx.lua_scripts); - lctx.lua_scripts_mem = 0; - lua_close(lctx.lua); + freeLuaScriptsSync(lctx.lua_scripts, lctx.lua_scripts_lru_list, lctx.lua); } void scriptingReset(int async) { @@ -418,8 +419,11 @@ uint64_t evalGetCommandFlags(client *c, uint64_t cmd_flags) { * exists, and in such a case, it behaves like in the success case. * * If 'c' is not NULL, on error the client is informed with an appropriate - * error describing the nature of the problem and the Lua interpreter error. */ -sds luaCreateFunction(client *c, robj *body) { + * error describing the nature of the problem and the Lua interpreter error. + * + * 'evalsha' indicating whether the lua function is created from the EVAL context + * or from the SCRIPT LOAD. 
*/ +sds luaCreateFunction(client *c, robj *body, int evalsha) { char funcname[43]; dictEntry *de; uint64_t script_flags; @@ -436,7 +440,9 @@ sds luaCreateFunction(client *c, robj *body) { ssize_t shebang_len = 0; sds err = NULL; if (evalExtractShebangFlags(body->ptr, &script_flags, &shebang_len, &err) == C_ERR) { - addReplyErrorSds(c, err); + if (c != NULL) { + addReplyErrorSds(c, err); + } return NULL; } @@ -462,6 +468,7 @@ sds luaCreateFunction(client *c, robj *body) { l->body = body; l->flags = script_flags; sds sha = sdsnewlen(funcname+2,40); + l->node = luaScriptsLRUAdd(c, sha, evalsha); int retval = dictAdd(lctx.lua_scripts,sha,l); serverAssertWithInfo(c ? c : lctx.lua_client,NULL,retval == DICT_OK); lctx.lua_scripts_mem += sdsZmallocSize(sha) + getStringObjectSdsUsedMemory(body); @@ -469,6 +476,63 @@ sds luaCreateFunction(client *c, robj *body) { return sha; } +/* Delete a Lua function with the specified sha. + * + * This will delete the lua function from the lua interpreter and delete + * the lua function from server. */ +void luaDeleteFunction(client *c, sds sha) { + /* Delete the script from lua interpreter. */ + char funcname[43]; + funcname[0] = 'f'; + funcname[1] = '_'; + memcpy(funcname+2, sha, 40); + funcname[42] = '\0'; + lua_pushnil(lctx.lua); + lua_setfield(lctx.lua, LUA_REGISTRYINDEX, funcname); + + /* Delete the script from server. */ + dictEntry *de = dictUnlink(lctx.lua_scripts, sha); + serverAssertWithInfo(c ? c : lctx.lua_client, NULL, de); + luaScript *l = dictGetVal(de); + /* We only delete `EVAL` scripts, which must exist in the LRU list. */ + serverAssert(l->node); + listDelNode(lctx.lua_scripts_lru_list, l->node); + lctx.lua_scripts_mem -= sdsZmallocSize(sha) + getStringObjectSdsUsedMemory(l->body); + dictFreeUnlinkedEntry(lctx.lua_scripts, de); +} + +/* Users who abuse EVAL will generate a new lua script on each call, which can + * consume large amounts of memory over time. 
Since EVAL is mostly the one that + * abuses the lua cache, and these won't have pipeline issues (scripts won't + * disappear when EVALSHA needs it and cause failure), we implement script eviction + * only for these (not for one loaded with SCRIPT LOAD). Considering that we don't + * have many scripts, then unlike keys, we don't need to worry about the memory + * usage of keeping a true sorted LRU linked list. + * + * 'evalsha' indicating whether the lua function is added from the EVAL context + * or from the SCRIPT LOAD. + * + * Returns the corresponding node added, which is used to save it in luaScript + * and use it for quick removal and re-insertion into an LRU list each time the + * script is used. */ +#define LRU_LIST_LENGTH 500 +listNode *luaScriptsLRUAdd(client *c, sds sha, int evalsha) { + /* Script eviction only applies to EVAL, not SCRIPT LOAD. */ + if (evalsha) return NULL; + + /* Evict oldest. */ + while (listLength(lctx.lua_scripts_lru_list) >= LRU_LIST_LENGTH) { + listNode *ln = listFirst(lctx.lua_scripts_lru_list); + sds oldest = listNodeValue(ln); + luaDeleteFunction(c, oldest); + server.stat_evictedscripts++; + } + + /* Add current. */ + listAddNodeTail(lctx.lua_scripts_lru_list, sha); + return listLast(lctx.lua_scripts_lru_list); +} + void evalGenericCommand(client *c, int evalsha) { lua_State *lua = lctx.lua; char funcname[43]; @@ -507,7 +571,7 @@ void evalGenericCommand(client *c, int evalsha) { addReplyErrorObject(c, shared.noscripterr); return; } - if (luaCreateFunction(c,c->argv[1]) == NULL) { + if (luaCreateFunction(c, c->argv[1], evalsha) == NULL) { lua_pop(lua,1); /* remove the error handler from the stack. */ /* The error is sent to the client by luaCreateFunction() * itself when it returns NULL. */ @@ -536,6 +600,13 @@ void evalGenericCommand(client *c, int evalsha) { luaCallFunction(&rctx, lua, c->argv+3, numkeys, c->argv+3+numkeys, c->argc-3-numkeys, ldb.active); lua_pop(lua,1); /* Remove the error handler. 
*/ scriptResetRun(&rctx); + + if (l->node) { + /* Quick removal and re-insertion after the script is called to + * maintain the LRU list. */ + listUnlinkNode(lctx.lua_scripts_lru_list, l->node); + listLinkNodeTail(lctx.lua_scripts_lru_list, l->node); + } } void evalCommand(client *c) { @@ -621,7 +692,7 @@ NULL addReply(c,shared.czero); } } else if (c->argc == 3 && !strcasecmp(c->argv[1]->ptr,"load")) { - sds sha = luaCreateFunction(c,c->argv[2]); + sds sha = luaCreateFunction(c, c->argv[2], 1); if (sha == NULL) return; /* The error was sent by luaCreateFunction(). */ addReplyBulkCBuffer(c,sha,40); } else if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"kill")) { @@ -661,7 +732,8 @@ dict* evalScriptsDict(void) { unsigned long evalScriptsMemory(void) { return lctx.lua_scripts_mem + dictMemUsage(lctx.lua_scripts) + - dictSize(lctx.lua_scripts) * sizeof(luaScript); + dictSize(lctx.lua_scripts) * sizeof(luaScript) + + listLength(lctx.lua_scripts_lru_list) * sizeof(listNode); } /* --------------------------------------------------------------------------- @@ -1665,3 +1737,7 @@ void luaLdbLineHook(lua_State *lua, lua_Debug *ar) { rctx->start_time = getMonotonicUs(); } } + +dict *getLuaScripts(void) { + return lctx.lua_scripts; +} diff --git a/src/evict.c b/src/evict.c index 909714b4304..890a845d5df 100644 --- a/src/evict.c +++ b/src/evict.c @@ -2,32 +2,11 @@ * * ---------------------------------------------------------------------------- * - * Copyright (c) 2009-2016, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -58,6 +37,7 @@ struct evictionPoolEntry { sds key; /* Key name. */ sds cached; /* Cached SDS object for key name. */ int dbid; /* Key DB number. */ + int slot; /* Slot. */ }; static struct evictionPoolEntry *EvictionPoolLRU; @@ -142,12 +122,12 @@ void evictionPoolAlloc(void) { * We insert keys on place in ascending order, so keys with the smaller * idle time are on the left, and keys with the higher idle time on the * right. 
*/ - -void evictionPoolPopulate(int dbid, dict *sampledict, dict *keydict, struct evictionPoolEntry *pool) { +int evictionPoolPopulate(redisDb *db, kvstore *samplekvs, struct evictionPoolEntry *pool) { int j, k, count; dictEntry *samples[server.maxmemory_samples]; - count = dictGetSomeKeys(sampledict,samples,server.maxmemory_samples); + int slot = kvstoreGetFairRandomDictIndex(samplekvs); + count = kvstoreDictGetSomeKeys(samplekvs,slot,samples,server.maxmemory_samples); for (j = 0; j < count; j++) { unsigned long long idle; sds key; @@ -161,13 +141,14 @@ void evictionPoolPopulate(int dbid, dict *sampledict, dict *keydict, struct evic * dictionary (but the expires one) we need to lookup the key * again in the key dictionary to obtain the value object. */ if (server.maxmemory_policy != MAXMEMORY_VOLATILE_TTL) { - if (sampledict != keydict) de = dictFind(keydict, key); + if (samplekvs != db->keys) + de = kvstoreDictFind(db->keys, slot, key); o = dictGetVal(de); } /* Calculate the idle time according to the policy. This is called * idle just because the code initially handled LRU, but is in fact - * just a score where an higher score means better candidate. */ + * just a score where a higher score means better candidate. 
*/ if (server.maxmemory_policy & MAXMEMORY_FLAG_LRU) { idle = estimateObjectIdleTime(o); } else if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) { @@ -236,8 +217,11 @@ void evictionPoolPopulate(int dbid, dict *sampledict, dict *keydict, struct evic pool[k].key = pool[k].cached; } pool[k].idle = idle; - pool[k].dbid = dbid; + pool[k].dbid = db->id; + pool[k].slot = slot; } + + return count; } /* ---------------------------------------------------------------------------- @@ -249,42 +233,40 @@ void evictionPoolPopulate(int dbid, dict *sampledict, dict *keydict, struct evic * * We split the 24 bits into two fields: * - * 16 bits 8 bits - * +----------------+--------+ - * + Last decr time | LOG_C | - * +----------------+--------+ + * 16 bits 8 bits + * +------------------+--------+ + * + Last access time | LOG_C | + * +------------------+--------+ * * LOG_C is a logarithmic counter that provides an indication of the access * frequency. However this field must also be decremented otherwise what used * to be a frequently accessed key in the past, will remain ranked like that * forever, while we want the algorithm to adapt to access pattern changes. * - * So the remaining 16 bits are used in order to store the "decrement time", + * So the remaining 16 bits are used in order to store the "access time", * a reduced-precision Unix time (we take 16 bits of the time converted * in minutes since we don't care about wrapping around) where the LOG_C - * counter is halved if it has an high value, or just decremented if it - * has a low value. + * counter decays every minute by default (depends on lfu-decay-time). * * New keys don't start at zero, in order to have the ability to collect * some accesses before being trashed away, so they start at LFU_INIT_VAL. 
* The logarithmic increment performed on LOG_C takes care of LFU_INIT_VAL * when incrementing the key, so that keys starting at LFU_INIT_VAL * (or having a smaller value) have a very high chance of being incremented - * on access. + * on access. (The chance depends on counter and lfu-log-factor.) * - * During decrement, the value of the logarithmic counter is halved if - * its current value is greater than two times the LFU_INIT_VAL, otherwise - * it is just decremented by one. + * During decrement, the value of the logarithmic counter is decremented by + * one when lfu-decay-time minutes elapsed. * --------------------------------------------------------------------------*/ /* Return the current time in minutes, just taking the least significant - * 16 bits. The returned time is suitable to be stored as LDT (last decrement + * 16 bits. The returned time is suitable to be stored as LDT (last access * time) for the LFU implementation. */ unsigned long LFUGetTimeInMinutes(void) { return (server.unixtime/60) & 65535; } -/* Given an object last access time, compute the minimum number of minutes +/* Given an object ldt (last access time), compute the minimum number of minutes * that elapsed since the last access. Handle overflow (ldt greater than * the current 16 bits minutes time) considering the time as wrapping * exactly once. */ @@ -306,10 +288,10 @@ uint8_t LFULogIncr(uint8_t counter) { return counter; } -/* If the object decrement time is reached decrement the LFU counter but +/* If the object's ldt (last access time) is reached, decrement the LFU counter but * do not update LFU fields of the object, we update the access time * and counter in an explicit way when the object is really accessed. - * And we will times halve the counter according to the times of + * And we will decrement the counter according to the times of * elapsed time than server.lfu_decay_time. * Return the object frequency counter. 
* @@ -569,6 +551,7 @@ int performEvictions(void) { /* Try to smoke-out bugs (server.also_propagate should be empty here) */ serverAssert(server.also_propagate.numops == 0); + /* Evictions are performed on random keys that have nothing to do with the current command slot. */ while (mem_freed < (long long)mem_tofree) { int j, k, i; @@ -576,27 +559,43 @@ int performEvictions(void) { sds bestkey = NULL; int bestdbid; redisDb *db; - dict *dict; dictEntry *de; if (server.maxmemory_policy & (MAXMEMORY_FLAG_LRU|MAXMEMORY_FLAG_LFU) || server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) { struct evictionPoolEntry *pool = EvictionPoolLRU; - while (bestkey == NULL) { - unsigned long total_keys = 0, keys; + unsigned long total_keys = 0; /* We don't want to make local-db choices when expiring keys, * so to start populate the eviction pool sampling keys from * every DB. */ for (i = 0; i < server.dbnum; i++) { db = server.db+i; - dict = (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) ? - db->dict : db->expires; - if ((keys = dictSize(dict)) != 0) { - evictionPoolPopulate(i, dict, db->dict, pool); - total_keys += keys; + kvstore *kvs; + if (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) { + kvs = db->keys; + } else { + kvs = db->expires; + } + unsigned long sampled_keys = 0; + unsigned long current_db_keys = kvstoreSize(kvs); + if (current_db_keys == 0) continue; + + total_keys += current_db_keys; + int l = kvstoreNumNonEmptyDicts(kvs); + /* Do not exceed the number of non-empty slots when looping. */ + while (l--) { + sampled_keys += evictionPoolPopulate(db, kvs, pool); + /* We have sampled enough keys in the current db, exit the loop. */ + if (sampled_keys >= (unsigned long) server.maxmemory_samples) + break; + /* If there are not a lot of keys in the current db, dict/s may be very + * sparsely populated, exit the loop without meeting the sampling + * requirement. 
*/ + if (current_db_keys < (unsigned long) server.maxmemory_samples*10) + break; } } if (!total_keys) break; /* No keys to evict. */ @@ -606,13 +605,13 @@ int performEvictions(void) { if (pool[k].key == NULL) continue; bestdbid = pool[k].dbid; + kvstore *kvs; if (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) { - de = dictFind(server.db[bestdbid].dict, - pool[k].key); + kvs = server.db[bestdbid].keys; } else { - de = dictFind(server.db[bestdbid].expires, - pool[k].key); + kvs = server.db[bestdbid].expires; } + de = kvstoreDictFind(kvs, pool[k].slot, pool[k].key); /* Remove the entry from the pool. */ if (pool[k].key != pool[k].cached) @@ -642,10 +641,15 @@ int performEvictions(void) { for (i = 0; i < server.dbnum; i++) { j = (++next_db) % server.dbnum; db = server.db+j; - dict = (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM) ? - db->dict : db->expires; - if (dictSize(dict) != 0) { - de = dictGetRandomKey(dict); + kvstore *kvs; + if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM) { + kvs = db->keys; + } else { + kvs = db->expires; + } + int slot = kvstoreGetFairRandomDictIndex(kvs); + de = kvstoreDictGetRandomKey(kvs, slot); + if (de) { bestkey = dictGetKey(de); bestdbid = j; break; @@ -667,6 +671,7 @@ int performEvictions(void) { * * AOF and Output buffer memory will be freed eventually so * we only care about memory used by the key space. 
*/ + enterExecutionUnit(1, 0); delta = (long long) zmalloc_used_memory(); latencyStartMonitor(eviction_latency); dbGenericDelete(db,keyobj,server.lazyfree_lazy_eviction,DB_FLAG_KEY_EVICTED); @@ -679,6 +684,7 @@ int performEvictions(void) { notifyKeyspaceEvent(NOTIFY_EVICTED, "evicted", keyobj, db->id); propagateDeletion(db,keyobj,server.lazyfree_lazy_eviction); + exitExecutionUnit(); postExecutionUnitOperations(); decrRefCount(keyobj); keys_freed++; diff --git a/src/expire.c b/src/expire.c index 425491af6bc..646f752a9c4 100644 --- a/src/expire.c +++ b/src/expire.c @@ -2,32 +2,11 @@ * * ---------------------------------------------------------------------------- * - * Copyright (c) 2009-2016, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -40,6 +19,10 @@ * if no access is performed on them. *----------------------------------------------------------------------------*/ +/* Constants table from pow(0.98, 1) to pow(0.98, 16). + * Help calculating the db->avg_ttl. */ +static double avg_ttl_factor[16] = {0.98, 0.9604, 0.941192, 0.922368, 0.903921, 0.885842, 0.868126, 0.850763, 0.833748, 0.817073, 0.800731, 0.784717, 0.769022, 0.753642, 0.738569, 0.723798}; + /* Helper function for the activeExpireCycle() function. * This function will try to expire the key that is stored in the hash table * entry 'de' of the 'expires' hash table of a Redis database. @@ -54,10 +37,12 @@ int activeExpireCycleTryExpire(redisDb *db, dictEntry *de, long long now) { long long t = dictGetSignedIntegerVal(de); if (now > t) { + enterExecutionUnit(1, 0); sds key = dictGetKey(de); robj *keyobj = createStringObject(key,sdslen(key)); deleteExpiredKeyAndPropagate(db,keyobj); decrRefCount(keyobj); + exitExecutionUnit(); return 1; } else { return 0; @@ -109,6 +94,7 @@ int activeExpireCycleTryExpire(redisDb *db, dictEntry *de, long long now) { #define ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC 25 /* Max % of CPU to use. */ #define ACTIVE_EXPIRE_CYCLE_ACCEPTABLE_STALE 10 /* % of stale keys after which we do extra efforts. 
*/ +#define HFE_ACTIVE_EXPIRE_CYCLE_FIELDS 1000 /* Data used by the expire dict scan callback. */ typedef struct { @@ -137,6 +123,65 @@ void expireScanCallback(void *privdata, const dictEntry *const_de) { data->sampled++; } +static inline int isExpiryDictValidForSamplingCb(dict *d) { + long long numkeys = dictSize(d); + unsigned long buckets = dictBuckets(d); + /* When there are less than 1% filled buckets, sampling the key + * space is expensive, so stop here waiting for better times... + * The dictionary will be resized asap. */ + if (buckets > DICT_HT_INITIAL_SIZE && (numkeys * 100/buckets < 1)) { + return C_ERR; + } + return C_OK; +} + +/* Active expiration Cycle for hash-fields. + * + * Note that releasing fields is expected to be more predictable and rewarding + * than releasing keys because it is stored in `ebuckets` DS which optimized for + * active expiration and in addition the deletion of fields is simple to handle. */ +static inline void activeExpireHashFieldCycle(int type) { + /* Remember current db across calls */ + static unsigned int currentDb = 0; + + /* Tracks the count of fields actively expired for the current database. + * This count continues as long as it fails to actively expire all expired + * fields of currentDb, indicating a possible need to adjust the value of + * maxToExpire. */ + static uint64_t activeExpirySequence = 0; + /* Threshold for adjusting maxToExpire */ + const uint32_t EXPIRED_FIELDS_TH = 1000000; + /* Maximum number of fields to actively expire in a single call */ + uint32_t maxToExpire = HFE_ACTIVE_EXPIRE_CYCLE_FIELDS; + + redisDb *db = server.db + currentDb; + + /* If db is empty, move to next db and return */ + if (ebIsEmpty(db->hexpires)) { + activeExpirySequence = 0; + currentDb = (currentDb + 1) % server.dbnum; + return; + } + + /* If running for a while and didn't manage to active-expire all expired fields of + * currentDb (i.e. 
activeExpirySequence becomes significant) then adjust maxToExpire */ + if ((activeExpirySequence > EXPIRED_FIELDS_TH) && (type == ACTIVE_EXPIRE_CYCLE_SLOW)) { + /* maxToExpire is multiplied by a factor between 1 and 32, proportional to + * the number of times activeExpirySequence exceeded EXPIRED_FIELDS_TH */ + uint64_t factor = activeExpirySequence / EXPIRED_FIELDS_TH; + maxToExpire *= (factor<32) ? factor : 32; + } + + if (hashTypeDbActiveExpire(db, maxToExpire) == maxToExpire) { + /* active-expire reached maxToExpire limit */ + activeExpirySequence += maxToExpire; + } else { + /* Managed to active-expire all expired fields of currentDb */ + activeExpirySequence = 0; + currentDb = (currentDb + 1) % server.dbnum; + } +} + void activeExpireCycle(int type) { /* Adjust the running parameters according to the configured expire * effort. The default effort is 1, and the maximum configurable effort @@ -160,6 +205,7 @@ void activeExpireCycle(int type) { int j, iteration = 0; int dbs_per_call = CRON_DBS_PER_CALL; + int dbs_performed = 0; long long start = ustime(), timelimit, elapsed; /* If 'expire' action is paused, for whatever reason, then don't expire any key. @@ -212,46 +258,55 @@ void activeExpireCycle(int type) { /* Try to smoke-out bugs (server.also_propagate should be empty here) */ serverAssert(server.also_propagate.numops == 0); - for (j = 0; j < dbs_per_call && timelimit_exit == 0; j++) { + /* Stop iteration when one of the following conditions is met: + * + * 1) We have checked a sufficient number of databases with expiration time. + * 2) The time limit has been exceeded. + * 3) All databases have been traversed. */ + for (j = 0; dbs_performed < dbs_per_call && timelimit_exit == 0 && j < server.dbnum; j++) { /* Scan callback data including expired and checked count per iteration. 
*/ expireScanData data; + data.ttl_sum = 0; + data.ttl_samples = 0; redisDb *db = server.db+(current_db % server.dbnum); data.db = db; + int db_done = 0; /* The scan of the current DB is done? */ + int update_avg_ttl_times = 0, repeat = 0; + /* Increment the DB now so we are sure if we run out of time * in the current DB we'll restart from the next. This allows to * distribute the time evenly across DBs. */ current_db++; + /* Interleaving hash-field expiration with key expiration. Better + * call it before handling expired keys because HFE DS is optimized for + * active expiration */ + activeExpireHashFieldCycle(type); + + if (kvstoreSize(db->expires)) + dbs_performed++; + /* Continue to expire if at the end of the cycle there are still * a big percentage of keys to expire, compared to the number of keys * we scanned. The percentage, stored in config_cycle_acceptable_stale * is not fixed, but depends on the Redis configured "expire effort". */ do { - unsigned long num, slots; + unsigned long num; iteration++; /* If there is nothing to expire try next DB ASAP. */ - if ((num = dictSize(db->expires)) == 0) { + if ((num = kvstoreSize(db->expires)) == 0) { db->avg_ttl = 0; break; } - slots = dictSlots(db->expires); data.now = mstime(); - /* When there are less than 1% filled slots, sampling the key - * space is expensive, so stop here waiting for better times... - * The dictionary will be resized asap. */ - if (slots > DICT_HT_INITIAL_SIZE && - (num*100/slots < 1)) break; - /* The main collection cycle. Scan through keys among keys * with an expire set, checking for expired ones. 
*/ data.sampled = 0; data.expired = 0; - data.ttl_sum = 0; - data.ttl_samples = 0; if (num > config_keys_per_loop) num = config_keys_per_loop; @@ -269,41 +324,70 @@ void activeExpireCycle(int type) { long max_buckets = num*20; long checked_buckets = 0; + int origin_ttl_samples = data.ttl_samples; + while (data.sampled < num && checked_buckets < max_buckets) { - db->expires_cursor = dictScan(db->expires, db->expires_cursor, - expireScanCallback, &data); + db->expires_cursor = kvstoreScan(db->expires, db->expires_cursor, -1, expireScanCallback, isExpiryDictValidForSamplingCb, &data); + if (db->expires_cursor == 0) { + db_done = 1; + break; + } checked_buckets++; } total_expired += data.expired; total_sampled += data.sampled; - /* Update the average TTL stats for this database. */ - if (data.ttl_samples) { - long long avg_ttl = data.ttl_sum / data.ttl_samples; + /* If find keys with ttl not yet expired, we need to update the average TTL stats once. */ + if (data.ttl_samples - origin_ttl_samples > 0) update_avg_ttl_times++; - /* Do a simple running average with a few samples. - * We just use the current estimate with a weight of 2% - * and the previous estimate with a weight of 98%. */ - if (db->avg_ttl == 0) db->avg_ttl = avg_ttl; - db->avg_ttl = (db->avg_ttl/50)*49 + (avg_ttl/50); - } + /* We don't repeat the cycle for the current database if the db is done + * for scanning or an acceptable number of stale keys (logically expired + * but yet not reclaimed). */ + repeat = db_done ? 0 : (data.sampled == 0 || (data.expired * 100 / data.sampled) > config_cycle_acceptable_stale); /* We can't block forever here even if there are many keys to - * expire. So after a given amount of milliseconds return to the + * expire. So after a given amount of microseconds return to the * caller waiting for the other active expire cycle. */ - if ((iteration & 0xf) == 0) { /* check once every 16 iterations. 
*/ - elapsed = ustime()-start; - if (elapsed > timelimit) { - timelimit_exit = 1; - server.stat_expired_time_cap_reached_count++; - break; + if ((iteration & 0xf) == 0 || !repeat) { /* Update the average TTL stats every 16 iterations or about to exit. */ + /* Update the average TTL stats for this database, + * because this may reach the time limit. */ + if (data.ttl_samples) { + long long avg_ttl = data.ttl_sum / data.ttl_samples; + + /* Do a simple running average with a few samples. + * We just use the current estimate with a weight of 2% + * and the previous estimate with a weight of 98%. */ + if (db->avg_ttl == 0) { + db->avg_ttl = avg_ttl; + } else { + /* The origin code is as follow. + * for (int i = 0; i < update_avg_ttl_times; i++) { + * db->avg_ttl = (db->avg_ttl/50)*49 + (avg_ttl/50); + * } + * We can convert the loop into a sum of a geometric progression. + * db->avg_ttl = db->avg_ttl * pow(0.98, update_avg_ttl_times) + + * avg_ttl / 50 * (pow(0.98, update_avg_ttl_times - 1) + ... + 1) + * = db->avg_ttl * pow(0.98, update_avg_ttl_times) + + * avg_ttl * (1 - pow(0.98, update_avg_ttl_times)) + * = avg_ttl + (db->avg_ttl - avg_ttl) * pow(0.98, update_avg_ttl_times) + * Notice that update_avg_ttl_times is between 1 and 16, we use a constant table + * to accelerate the calculation of pow(0.98, update_avg_ttl_times).*/ + db->avg_ttl = avg_ttl + (db->avg_ttl - avg_ttl) * avg_ttl_factor[update_avg_ttl_times - 1] ; + } + update_avg_ttl_times = 0; + data.ttl_sum = 0; + data.ttl_samples = 0; + } + if ((iteration & 0xf) == 0) { /* check time limit every 16 iterations. */ + elapsed = ustime()-start; + if (elapsed > timelimit) { + timelimit_exit = 1; + server.stat_expired_time_cap_reached_count++; + break; + } } } - /* We don't repeat the cycle for the current database if there are - * an acceptable amount of stale keys (logically expired but yet - * not reclaimed). 
*/ - } while (data.sampled == 0 || - (data.expired * 100 / data.sampled) > config_cycle_acceptable_stale); + } while (repeat); } elapsed = ustime()-start; @@ -378,7 +462,7 @@ void expireSlaveKeys(void) { while(dbids && dbid < server.dbnum) { if ((dbids & 1) != 0) { redisDb *db = server.db+dbid; - dictEntry *expire = dictFind(db->expires,keyname); + dictEntry *expire = dbFindExpires(db, keyname); int expired = 0; if (expire && diff --git a/src/fmacros.h b/src/fmacros.h index c5da4b7345a..92791cbcec7 100644 --- a/src/fmacros.h +++ b/src/fmacros.h @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef _REDIS_FMACRO_H diff --git a/src/fmtargs.h b/src/fmtargs.h new file mode 100644 index 00000000000..e52d3b99c50 --- /dev/null +++ b/src/fmtargs.h @@ -0,0 +1,173 @@ +/* + * Copyright Redis Contributors. + * All rights reserved. + * SPDX-License-Identifier: BSD 3-Clause + * + * To make it easier to map each part of the format string with each argument, + * this file provides a way to write + * + * printf("a = %s, b = %s, c = %s\n", + * arg1, arg2, arg3); + * + * as + * + * printf(FMTARGS("a = %s, ", arg1, + * "b = %s, ", arg2, + * "c = %s\n", arg3)); + * + * FMTARGS is variadic macro which is implemented by passing on its arguments to + * two other variadic macros of which one extracts the odd (the formats) and the + * other extracts the even (the arguments). The definitions of these macros + * include counting the number of macro arguments. Therefore, they don't accept + * an unlimited number of arguments. Currently it is fixed to a maximum of 120 + * formats and arguments. + */ +#ifndef FMTARGS_H +#define FMTARGS_H + +/* A macro to count the number of arguments. */ +#define NARG(...) NARG_I(__VA_ARGS__,RSEQ_N()) +#define NARG_I(...) ARG_N(__VA_ARGS__) + +/* Define a macro which will call an arbitrary macro appended with a number indicating + * the number of arguments it has. 
*/ +#define VFUNC_N_(name, n) name##n +#define VFUNC_N(name, n) VFUNC_N_(name, n) +#define VFUNC(func, ...) VFUNC_N(func, NARG(__VA_ARGS__)) (__VA_ARGS__) + +/* Macros to extract the formats and the arguments from the fmt-arg pairs and + * then combine them again with all formats first and the arguments last. */ +#define COMPACT_FMT(...) VFUNC(COMPACT_FMT_, __VA_ARGS__) +#define COMPACT_VALUES(...) VFUNC(COMPACT_VALUES_, __VA_ARGS__) +#define FMTARGS(...) COMPACT_FMT(__VA_ARGS__), COMPACT_VALUES(__VA_ARGS__) + +/* Everything below this line is automatically generated by + * generate-fmtargs.py. Do not manually edit. */ + +#define ARG_N(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, _61, _62, _63, _64, _65, _66, _67, _68, _69, _70, _71, _72, _73, _74, _75, _76, _77, _78, _79, _80, _81, _82, _83, _84, _85, _86, _87, _88, _89, _90, _91, _92, _93, _94, _95, _96, _97, _98, _99, _100, _101, _102, _103, _104, _105, _106, _107, _108, _109, _110, _111, _112, _113, _114, _115, _116, _117, _118, _119, _120, N, ...) N + +#define RSEQ_N() 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86, 85, 84, 83, 82, 81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +#define COMPACT_FMT_2(fmt, value) fmt +#define COMPACT_FMT_4(fmt, value, ...) fmt COMPACT_FMT_2(__VA_ARGS__) +#define COMPACT_FMT_6(fmt, value, ...) 
fmt COMPACT_FMT_4(__VA_ARGS__) +#define COMPACT_FMT_8(fmt, value, ...) fmt COMPACT_FMT_6(__VA_ARGS__) +#define COMPACT_FMT_10(fmt, value, ...) fmt COMPACT_FMT_8(__VA_ARGS__) +#define COMPACT_FMT_12(fmt, value, ...) fmt COMPACT_FMT_10(__VA_ARGS__) +#define COMPACT_FMT_14(fmt, value, ...) fmt COMPACT_FMT_12(__VA_ARGS__) +#define COMPACT_FMT_16(fmt, value, ...) fmt COMPACT_FMT_14(__VA_ARGS__) +#define COMPACT_FMT_18(fmt, value, ...) fmt COMPACT_FMT_16(__VA_ARGS__) +#define COMPACT_FMT_20(fmt, value, ...) fmt COMPACT_FMT_18(__VA_ARGS__) +#define COMPACT_FMT_22(fmt, value, ...) fmt COMPACT_FMT_20(__VA_ARGS__) +#define COMPACT_FMT_24(fmt, value, ...) fmt COMPACT_FMT_22(__VA_ARGS__) +#define COMPACT_FMT_26(fmt, value, ...) fmt COMPACT_FMT_24(__VA_ARGS__) +#define COMPACT_FMT_28(fmt, value, ...) fmt COMPACT_FMT_26(__VA_ARGS__) +#define COMPACT_FMT_30(fmt, value, ...) fmt COMPACT_FMT_28(__VA_ARGS__) +#define COMPACT_FMT_32(fmt, value, ...) fmt COMPACT_FMT_30(__VA_ARGS__) +#define COMPACT_FMT_34(fmt, value, ...) fmt COMPACT_FMT_32(__VA_ARGS__) +#define COMPACT_FMT_36(fmt, value, ...) fmt COMPACT_FMT_34(__VA_ARGS__) +#define COMPACT_FMT_38(fmt, value, ...) fmt COMPACT_FMT_36(__VA_ARGS__) +#define COMPACT_FMT_40(fmt, value, ...) fmt COMPACT_FMT_38(__VA_ARGS__) +#define COMPACT_FMT_42(fmt, value, ...) fmt COMPACT_FMT_40(__VA_ARGS__) +#define COMPACT_FMT_44(fmt, value, ...) fmt COMPACT_FMT_42(__VA_ARGS__) +#define COMPACT_FMT_46(fmt, value, ...) fmt COMPACT_FMT_44(__VA_ARGS__) +#define COMPACT_FMT_48(fmt, value, ...) fmt COMPACT_FMT_46(__VA_ARGS__) +#define COMPACT_FMT_50(fmt, value, ...) fmt COMPACT_FMT_48(__VA_ARGS__) +#define COMPACT_FMT_52(fmt, value, ...) fmt COMPACT_FMT_50(__VA_ARGS__) +#define COMPACT_FMT_54(fmt, value, ...) fmt COMPACT_FMT_52(__VA_ARGS__) +#define COMPACT_FMT_56(fmt, value, ...) fmt COMPACT_FMT_54(__VA_ARGS__) +#define COMPACT_FMT_58(fmt, value, ...) fmt COMPACT_FMT_56(__VA_ARGS__) +#define COMPACT_FMT_60(fmt, value, ...) 
fmt COMPACT_FMT_58(__VA_ARGS__) +#define COMPACT_FMT_62(fmt, value, ...) fmt COMPACT_FMT_60(__VA_ARGS__) +#define COMPACT_FMT_64(fmt, value, ...) fmt COMPACT_FMT_62(__VA_ARGS__) +#define COMPACT_FMT_66(fmt, value, ...) fmt COMPACT_FMT_64(__VA_ARGS__) +#define COMPACT_FMT_68(fmt, value, ...) fmt COMPACT_FMT_66(__VA_ARGS__) +#define COMPACT_FMT_70(fmt, value, ...) fmt COMPACT_FMT_68(__VA_ARGS__) +#define COMPACT_FMT_72(fmt, value, ...) fmt COMPACT_FMT_70(__VA_ARGS__) +#define COMPACT_FMT_74(fmt, value, ...) fmt COMPACT_FMT_72(__VA_ARGS__) +#define COMPACT_FMT_76(fmt, value, ...) fmt COMPACT_FMT_74(__VA_ARGS__) +#define COMPACT_FMT_78(fmt, value, ...) fmt COMPACT_FMT_76(__VA_ARGS__) +#define COMPACT_FMT_80(fmt, value, ...) fmt COMPACT_FMT_78(__VA_ARGS__) +#define COMPACT_FMT_82(fmt, value, ...) fmt COMPACT_FMT_80(__VA_ARGS__) +#define COMPACT_FMT_84(fmt, value, ...) fmt COMPACT_FMT_82(__VA_ARGS__) +#define COMPACT_FMT_86(fmt, value, ...) fmt COMPACT_FMT_84(__VA_ARGS__) +#define COMPACT_FMT_88(fmt, value, ...) fmt COMPACT_FMT_86(__VA_ARGS__) +#define COMPACT_FMT_90(fmt, value, ...) fmt COMPACT_FMT_88(__VA_ARGS__) +#define COMPACT_FMT_92(fmt, value, ...) fmt COMPACT_FMT_90(__VA_ARGS__) +#define COMPACT_FMT_94(fmt, value, ...) fmt COMPACT_FMT_92(__VA_ARGS__) +#define COMPACT_FMT_96(fmt, value, ...) fmt COMPACT_FMT_94(__VA_ARGS__) +#define COMPACT_FMT_98(fmt, value, ...) fmt COMPACT_FMT_96(__VA_ARGS__) +#define COMPACT_FMT_100(fmt, value, ...) fmt COMPACT_FMT_98(__VA_ARGS__) +#define COMPACT_FMT_102(fmt, value, ...) fmt COMPACT_FMT_100(__VA_ARGS__) +#define COMPACT_FMT_104(fmt, value, ...) fmt COMPACT_FMT_102(__VA_ARGS__) +#define COMPACT_FMT_106(fmt, value, ...) fmt COMPACT_FMT_104(__VA_ARGS__) +#define COMPACT_FMT_108(fmt, value, ...) fmt COMPACT_FMT_106(__VA_ARGS__) +#define COMPACT_FMT_110(fmt, value, ...) fmt COMPACT_FMT_108(__VA_ARGS__) +#define COMPACT_FMT_112(fmt, value, ...) fmt COMPACT_FMT_110(__VA_ARGS__) +#define COMPACT_FMT_114(fmt, value, ...) 
fmt COMPACT_FMT_112(__VA_ARGS__) +#define COMPACT_FMT_116(fmt, value, ...) fmt COMPACT_FMT_114(__VA_ARGS__) +#define COMPACT_FMT_118(fmt, value, ...) fmt COMPACT_FMT_116(__VA_ARGS__) +#define COMPACT_FMT_120(fmt, value, ...) fmt COMPACT_FMT_118(__VA_ARGS__) + +#define COMPACT_VALUES_2(fmt, value) value +#define COMPACT_VALUES_4(fmt, value, ...) value, COMPACT_VALUES_2(__VA_ARGS__) +#define COMPACT_VALUES_6(fmt, value, ...) value, COMPACT_VALUES_4(__VA_ARGS__) +#define COMPACT_VALUES_8(fmt, value, ...) value, COMPACT_VALUES_6(__VA_ARGS__) +#define COMPACT_VALUES_10(fmt, value, ...) value, COMPACT_VALUES_8(__VA_ARGS__) +#define COMPACT_VALUES_12(fmt, value, ...) value, COMPACT_VALUES_10(__VA_ARGS__) +#define COMPACT_VALUES_14(fmt, value, ...) value, COMPACT_VALUES_12(__VA_ARGS__) +#define COMPACT_VALUES_16(fmt, value, ...) value, COMPACT_VALUES_14(__VA_ARGS__) +#define COMPACT_VALUES_18(fmt, value, ...) value, COMPACT_VALUES_16(__VA_ARGS__) +#define COMPACT_VALUES_20(fmt, value, ...) value, COMPACT_VALUES_18(__VA_ARGS__) +#define COMPACT_VALUES_22(fmt, value, ...) value, COMPACT_VALUES_20(__VA_ARGS__) +#define COMPACT_VALUES_24(fmt, value, ...) value, COMPACT_VALUES_22(__VA_ARGS__) +#define COMPACT_VALUES_26(fmt, value, ...) value, COMPACT_VALUES_24(__VA_ARGS__) +#define COMPACT_VALUES_28(fmt, value, ...) value, COMPACT_VALUES_26(__VA_ARGS__) +#define COMPACT_VALUES_30(fmt, value, ...) value, COMPACT_VALUES_28(__VA_ARGS__) +#define COMPACT_VALUES_32(fmt, value, ...) value, COMPACT_VALUES_30(__VA_ARGS__) +#define COMPACT_VALUES_34(fmt, value, ...) value, COMPACT_VALUES_32(__VA_ARGS__) +#define COMPACT_VALUES_36(fmt, value, ...) value, COMPACT_VALUES_34(__VA_ARGS__) +#define COMPACT_VALUES_38(fmt, value, ...) value, COMPACT_VALUES_36(__VA_ARGS__) +#define COMPACT_VALUES_40(fmt, value, ...) value, COMPACT_VALUES_38(__VA_ARGS__) +#define COMPACT_VALUES_42(fmt, value, ...) value, COMPACT_VALUES_40(__VA_ARGS__) +#define COMPACT_VALUES_44(fmt, value, ...) 
value, COMPACT_VALUES_42(__VA_ARGS__) +#define COMPACT_VALUES_46(fmt, value, ...) value, COMPACT_VALUES_44(__VA_ARGS__) +#define COMPACT_VALUES_48(fmt, value, ...) value, COMPACT_VALUES_46(__VA_ARGS__) +#define COMPACT_VALUES_50(fmt, value, ...) value, COMPACT_VALUES_48(__VA_ARGS__) +#define COMPACT_VALUES_52(fmt, value, ...) value, COMPACT_VALUES_50(__VA_ARGS__) +#define COMPACT_VALUES_54(fmt, value, ...) value, COMPACT_VALUES_52(__VA_ARGS__) +#define COMPACT_VALUES_56(fmt, value, ...) value, COMPACT_VALUES_54(__VA_ARGS__) +#define COMPACT_VALUES_58(fmt, value, ...) value, COMPACT_VALUES_56(__VA_ARGS__) +#define COMPACT_VALUES_60(fmt, value, ...) value, COMPACT_VALUES_58(__VA_ARGS__) +#define COMPACT_VALUES_62(fmt, value, ...) value, COMPACT_VALUES_60(__VA_ARGS__) +#define COMPACT_VALUES_64(fmt, value, ...) value, COMPACT_VALUES_62(__VA_ARGS__) +#define COMPACT_VALUES_66(fmt, value, ...) value, COMPACT_VALUES_64(__VA_ARGS__) +#define COMPACT_VALUES_68(fmt, value, ...) value, COMPACT_VALUES_66(__VA_ARGS__) +#define COMPACT_VALUES_70(fmt, value, ...) value, COMPACT_VALUES_68(__VA_ARGS__) +#define COMPACT_VALUES_72(fmt, value, ...) value, COMPACT_VALUES_70(__VA_ARGS__) +#define COMPACT_VALUES_74(fmt, value, ...) value, COMPACT_VALUES_72(__VA_ARGS__) +#define COMPACT_VALUES_76(fmt, value, ...) value, COMPACT_VALUES_74(__VA_ARGS__) +#define COMPACT_VALUES_78(fmt, value, ...) value, COMPACT_VALUES_76(__VA_ARGS__) +#define COMPACT_VALUES_80(fmt, value, ...) value, COMPACT_VALUES_78(__VA_ARGS__) +#define COMPACT_VALUES_82(fmt, value, ...) value, COMPACT_VALUES_80(__VA_ARGS__) +#define COMPACT_VALUES_84(fmt, value, ...) value, COMPACT_VALUES_82(__VA_ARGS__) +#define COMPACT_VALUES_86(fmt, value, ...) value, COMPACT_VALUES_84(__VA_ARGS__) +#define COMPACT_VALUES_88(fmt, value, ...) value, COMPACT_VALUES_86(__VA_ARGS__) +#define COMPACT_VALUES_90(fmt, value, ...) value, COMPACT_VALUES_88(__VA_ARGS__) +#define COMPACT_VALUES_92(fmt, value, ...) 
value, COMPACT_VALUES_90(__VA_ARGS__) +#define COMPACT_VALUES_94(fmt, value, ...) value, COMPACT_VALUES_92(__VA_ARGS__) +#define COMPACT_VALUES_96(fmt, value, ...) value, COMPACT_VALUES_94(__VA_ARGS__) +#define COMPACT_VALUES_98(fmt, value, ...) value, COMPACT_VALUES_96(__VA_ARGS__) +#define COMPACT_VALUES_100(fmt, value, ...) value, COMPACT_VALUES_98(__VA_ARGS__) +#define COMPACT_VALUES_102(fmt, value, ...) value, COMPACT_VALUES_100(__VA_ARGS__) +#define COMPACT_VALUES_104(fmt, value, ...) value, COMPACT_VALUES_102(__VA_ARGS__) +#define COMPACT_VALUES_106(fmt, value, ...) value, COMPACT_VALUES_104(__VA_ARGS__) +#define COMPACT_VALUES_108(fmt, value, ...) value, COMPACT_VALUES_106(__VA_ARGS__) +#define COMPACT_VALUES_110(fmt, value, ...) value, COMPACT_VALUES_108(__VA_ARGS__) +#define COMPACT_VALUES_112(fmt, value, ...) value, COMPACT_VALUES_110(__VA_ARGS__) +#define COMPACT_VALUES_114(fmt, value, ...) value, COMPACT_VALUES_112(__VA_ARGS__) +#define COMPACT_VALUES_116(fmt, value, ...) value, COMPACT_VALUES_114(__VA_ARGS__) +#define COMPACT_VALUES_118(fmt, value, ...) value, COMPACT_VALUES_116(__VA_ARGS__) +#define COMPACT_VALUES_120(fmt, value, ...) value, COMPACT_VALUES_118(__VA_ARGS__) + +#endif diff --git a/src/function_lua.c b/src/function_lua.c index 91bb5cd67a1..61a20a4c62e 100644 --- a/src/function_lua.c +++ b/src/function_lua.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2021, Redis Ltd. + * Copyright (c) 2021-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ /* @@ -51,7 +30,6 @@ #define REGISTRY_LOAD_CTX_NAME "__LIBRARY_CTX__" #define LIBRARY_API_NAME "__LIBRARY_API__" #define GLOBALS_API_NAME "__GLOBALS_API__" -#define LOAD_TIMEOUT_MS 500 /* Lua engine ctx */ typedef struct luaEngineCtx { @@ -67,6 +45,7 @@ typedef struct luaFunctionCtx { typedef struct loadCtx { functionLibInfo *li; monotime start_time; + size_t timeout; } loadCtx; typedef struct registerFunctionArgs { @@ -85,7 +64,7 @@ static void luaEngineLoadHook(lua_State *lua, lua_Debug *ar) { loadCtx *load_ctx = luaGetFromRegistry(lua, REGISTRY_LOAD_CTX_NAME); serverAssert(load_ctx); /* Only supported inside script invocation */ uint64_t duration = elapsedMs(load_ctx->start_time); - if (duration > LOAD_TIMEOUT_MS) { + if (load_ctx->timeout > 0 && duration > load_ctx->timeout) { lua_sethook(lua, luaEngineLoadHook, LUA_MASKLINE, 0); luaPushError(lua,"FUNCTION LOAD timeout"); @@ -100,7 +79,7 @@ static void luaEngineLoadHook(lua_State *lua, lua_Debug *ar) { * * Return NULL on compilation error and set the error to the err variable */ -static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, sds *err) { +static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size_t timeout, sds *err) { int ret = C_ERR; luaEngineCtx *lua_engine_ctx = engine_ctx; lua_State *lua = lua_engine_ctx->lua; @@ -124,6 +103,7 @@ static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, sds loadCtx load_ctx = { .li = li, .start_time = getMonotonicUs(), + .timeout = timeout, }; luaSaveOnRegistry(lua, REGISTRY_LOAD_CTX_NAME, &load_ctx); @@ -422,7 +402,7 @@ static int luaRegisterFunction(lua_State *lua) { /* Initialize Lua engine, should be called once on start. 
*/ int luaEngineInitEngine(void) { luaEngineCtx *lua_engine_ctx = zmalloc(sizeof(*lua_engine_ctx)); - lua_engine_ctx->lua = lua_open(); + lua_engine_ctx->lua = createLuaState(); luaRegisterRedisAPI(lua_engine_ctx->lua); diff --git a/src/functions.c b/src/functions.c index f5738ba79d6..427cda8d003 100644 --- a/src/functions.c +++ b/src/functions.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2021, Redis Ltd. + * Copyright (c) 2011-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
+ * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "functions.h" @@ -33,6 +12,8 @@ #include "adlist.h" #include "atomicvar.h" +#define LOAD_TIMEOUT_MS 500 + typedef enum { restorePolicy_Flush, restorePolicy_Append, restorePolicy_Replace } restorePolicy; @@ -116,10 +97,7 @@ dictType librariesDictType = { /* Dictionary of engines */ static dict *engines = NULL; -/* Libraries Ctx. - * Contains the dictionary that map a library name to library object, - * Contains the dictionary that map a function name to function object, - * and the cache memory used by all the functions */ +/* Libraries Ctx. */ static functionsLibCtx *curr_functions_lib_ctx = NULL; static size_t functionMallocSize(functionInfo *fi) { @@ -497,7 +475,6 @@ static void functionListReplyFlags(client *c, functionInfo *fi) { * Return general information about all the libraries: * * Library name * * The engine used to run the Library - * * Library description * * Functions list * * Library code (if WITHCODE is given) * @@ -679,7 +656,6 @@ void fcallroCommand(client *c) { * is saved separately with the following information: * * Library name * * Engine name - * * Library description * * Library code * RDB_OPCODE_FUNCTION2 is saved before each library to present * that the payload is a library. 
@@ -838,7 +814,6 @@ void functionHelpCommand(client *c) { " Return general information on all the libraries:", " * Library name", " * The engine used to run the Library", -" * Library description", " * Functions list", " * Library code (if WITHCODE is given)", " It also possible to get only function that matches a pattern using LIBRARYNAME argument.", @@ -892,9 +867,7 @@ static int functionsVerifyName(sds name) { int functionExtractLibMetaData(sds payload, functionsLibMataData *md, sds *err) { sds name = NULL; - sds desc = NULL; sds engine = NULL; - sds code = NULL; if (strncmp(payload, "#!", 2) != 0) { *err = sdsnew("Missing library metadata"); return C_ERR; @@ -946,9 +919,7 @@ int functionExtractLibMetaData(sds payload, functionsLibMataData *md, sds *err) error: if (name) sdsfree(name); - if (desc) sdsfree(desc); if (engine) sdsfree(engine); - if (code) sdsfree(code); sdsfreesplitres(parts, numparts); return C_ERR; } @@ -961,7 +932,7 @@ void functionFreeLibMetaData(functionsLibMataData *md) { /* Compile and save the given library, return the loaded library name on success * and NULL on failure. 
In case on failure the err out param is set with relevant error message */ -sds functionsCreateWithLibraryCtx(sds code, int replace, sds* err, functionsLibCtx *lib_ctx) { +sds functionsCreateWithLibraryCtx(sds code, int replace, sds* err, functionsLibCtx *lib_ctx, size_t timeout) { dictIterator *iter = NULL; dictEntry *entry = NULL; functionLibInfo *new_li = NULL; @@ -995,7 +966,7 @@ sds functionsCreateWithLibraryCtx(sds code, int replace, sds* err, functionsLibC } new_li = engineLibraryCreate(md.name, ei, code); - if (engine->create(engine->engine_ctx, new_li, md.code, err) != C_OK) { + if (engine->create(engine->engine_ctx, new_li, md.code, timeout, err) != C_OK) { goto error; } @@ -1063,7 +1034,11 @@ void functionLoadCommand(client *c) { robj *code = c->argv[argc_pos]; sds err = NULL; sds library_name = NULL; - if (!(library_name = functionsCreateWithLibraryCtx(code->ptr, replace, &err, curr_functions_lib_ctx))) + size_t timeout = LOAD_TIMEOUT_MS; + if (mustObeyClient(c)) { + timeout = 0; + } + if (!(library_name = functionsCreateWithLibraryCtx(code->ptr, replace, &err, curr_functions_lib_ctx, timeout))) { addReplyErrorSds(c, err); return; @@ -1078,15 +1053,15 @@ void functionLoadCommand(client *c) { unsigned long functionsMemory(void) { dictIterator *iter = dictGetIterator(engines); dictEntry *entry = NULL; - size_t engines_nemory = 0; + size_t engines_memory = 0; while ((entry = dictNext(iter))) { engineInfo *ei = dictGetVal(entry); engine *engine = ei->engine; - engines_nemory += engine->get_used_memory(engine->engine_ctx); + engines_memory += engine->get_used_memory(engine->engine_ctx); } dictReleaseIterator(iter); - return engines_nemory; + return engines_memory; } /* Return memory overhead of all the engines combine */ @@ -1113,7 +1088,7 @@ dict* functionsLibGet(void) { return curr_functions_lib_ctx->libraries; } -size_t functionsLibCtxfunctionsLen(functionsLibCtx *functions_ctx) { +size_t functionsLibCtxFunctionsLen(functionsLibCtx *functions_ctx) { 
return dictSize(functions_ctx->functions); } diff --git a/src/functions.h b/src/functions.h index 26e45babc54..1d69e3794af 100644 --- a/src/functions.h +++ b/src/functions.h @@ -1,30 +1,9 @@ /* - * Copyright (c) 2021, Redis Ltd. + * Copyright (c) 2021-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #ifndef __FUNCTIONS_H_ @@ -32,11 +11,16 @@ /* * functions.c unit provides the Redis Functions API: - * * FUNCTION CREATE - * * FUNCTION CALL + * * FUNCTION LOAD + * * FUNCTION LIST + * * FUNCTION CALL (FCALL and FCALL_RO) * * FUNCTION DELETE + * * FUNCTION STATS * * FUNCTION KILL - * * FUNCTION INFO + * * FUNCTION FLUSH + * * FUNCTION DUMP + * * FUNCTION RESTORE + * * FUNCTION HELP * * Also contains implementation for: * * Save/Load function from rdb @@ -53,9 +37,14 @@ typedef struct engine { /* engine specific context */ void *engine_ctx; - /* Create function callback, get the engine_ctx, and function code. - * returns NULL on error and set sds to be the error message */ - int (*create)(void *engine_ctx, functionLibInfo *li, sds code, sds *err); + /* Create function callback, get the engine_ctx, and function code + * engine_ctx - opaque struct that was created on engine initialization + * li - library information that need to be provided and when add functions + * code - the library code + * timeout - timeout for the library creation (0 for no timeout) + * err - description of error (if occurred) + * returns C_ERR on error and set err to be the error message */ + int (*create)(void *engine_ctx, functionLibInfo *li, sds code, size_t timeout, sds *err); /* Invoking a function, r_ctx is an opaque object (from engine POV). 
* The r_ctx should be used by the engine to interaction with Redis, @@ -109,13 +98,13 @@ struct functionLibInfo { }; int functionsRegisterEngine(const char *engine_name, engine *engine_ctx); -sds functionsCreateWithLibraryCtx(sds code, int replace, sds* err, functionsLibCtx *lib_ctx); +sds functionsCreateWithLibraryCtx(sds code, int replace, sds* err, functionsLibCtx *lib_ctx, size_t timeout); unsigned long functionsMemory(void); unsigned long functionsMemoryOverhead(void); unsigned long functionsNum(void); unsigned long functionsLibNum(void); dict* functionsLibGet(void); -size_t functionsLibCtxfunctionsLen(functionsLibCtx *functions_ctx); +size_t functionsLibCtxFunctionsLen(functionsLibCtx *functions_ctx); functionsLibCtx* functionsLibCtxGetCurrent(void); functionsLibCtx* functionsLibCtxCreate(void); void functionsLibCtxClearCurrent(int async); diff --git a/src/geo.c b/src/geo.c index ac25a20c6cb..90817998a19 100644 --- a/src/geo.c +++ b/src/geo.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2014, Matt Stancliff . - * Copyright (c) 2015-2016, Salvatore Sanfilippo . + * Copyright (c) 2015-current, Redis Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -300,7 +300,7 @@ int geoGetPointsInRange(robj *zobj, double min, double max, GeoShape *shape, geo zskiplist *zsl = zs->zsl; zskiplistNode *ln; - if ((ln = zslFirstInRange(zsl, &range)) == NULL) { + if ((ln = zslNthInRange(zsl, &range, 0)) == NULL) { /* Nothing exists starting at our min. No results. */ return 0; } @@ -690,7 +690,7 @@ void georadiusGeneric(client *c, int srcKeyIndex, int flags) { } if (any && !count) { - addReplyErrorFormat(c, "the ANY argument requires COUNT argument"); + addReplyError(c, "the ANY argument requires COUNT argument"); return; } diff --git a/src/geohash.c b/src/geohash.c index 2cbcf287543..e9f0c654dd6 100644 --- a/src/geohash.c +++ b/src/geohash.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2013-2014, yinqiwen * Copyright (c) 2014, Matt Stancliff . 
- * Copyright (c) 2015-2016, Salvatore Sanfilippo . + * Copyright (c) 2015-current, Redis Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/geohash.h b/src/geohash.h index 4befb93039e..19fa5a1d0fd 100644 --- a/src/geohash.h +++ b/src/geohash.h @@ -1,7 +1,7 @@ /* * Copyright (c) 2013-2014, yinqiwen * Copyright (c) 2014, Matt Stancliff . - * Copyright (c) 2015, Salvatore Sanfilippo . + * Copyright (c) 2015-current, Redis Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/geohash_helper.c b/src/geohash_helper.c index a3816fbe337..ba373268921 100644 --- a/src/geohash_helper.c +++ b/src/geohash_helper.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2013-2014, yinqiwen * Copyright (c) 2014, Matt Stancliff . - * Copyright (c) 2015-2016, Salvatore Sanfilippo . + * Copyright (c) 2015-current, Redis Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/geohash_helper.h b/src/geohash_helper.h index 56c731fadd6..262bd8e8da3 100644 --- a/src/geohash_helper.h +++ b/src/geohash_helper.h @@ -1,7 +1,7 @@ /* * Copyright (c) 2013-2014, yinqiwen * Copyright (c) 2014, Matt Stancliff . - * Copyright (c) 2015, Salvatore Sanfilippo . + * Copyright (c) 2015-current, Redis Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 1a74f479377..cb0929f47e8 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -1,32 +1,11 @@ /* hyperloglog.c - Redis HyperLogLog probabilistic cardinality approximation. * This file implements the algorithm and the exported Redis commands. * - * Copyright (c) 2014, Salvatore Sanfilippo + * Copyright (c) 2014-Present, Redis Ltd. * All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -1220,10 +1199,10 @@ void pfaddCommand(client *c) { } hdr = o->ptr; if (updated) { + HLL_INVALIDATE_CACHE(hdr); signalModifiedKey(c,c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_STRING,"pfadd",c->argv[1],c->db->id); server.dirty += updated; - HLL_INVALIDATE_CACHE(hdr); } addReply(c, updated ? 
shared.cone : shared.czero); } diff --git a/src/intset.c b/src/intset.c index 621a74283a2..5216251eb52 100644 --- a/src/intset.c +++ b/src/intset.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2009-2012, Pieter Noordhuis - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-current, Redis Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/intset.h b/src/intset.h index 41cc7b8222a..4259aaa7930 100644 --- a/src/intset.h +++ b/src/intset.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2009-2012, Pieter Noordhuis - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-current, Redis Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/kvstore.c b/src/kvstore.c new file mode 100644 index 00000000000..890b85c13c3 --- /dev/null +++ b/src/kvstore.c @@ -0,0 +1,1033 @@ +/* + * Index-based KV store implementation + * This file implements a KV store comprised of an array of dicts (see dict.c) + * The purpose of this KV store is to have easy access to all keys that belong + * in the same dict (i.e. are in the same dict-index) + * + * For example, when Redis is running in cluster mode, we use kvstore to save + * all keys that map to the same hash-slot in a separate dict within the kvstore + * struct. + * This enables us to easily access all keys that map to a specific hash-slot. + * + * Copyright (c) 2011-Present, Redis Ltd. and contributors. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
+ */ +#include "fmacros.h" + +#include +#include + +#include "zmalloc.h" +#include "kvstore.h" +#include "redisassert.h" +#include "monotonic.h" + +#define UNUSED(V) ((void) V) + +struct _kvstore { + int flags; + dictType dtype; + dict **dicts; + long long num_dicts; + long long num_dicts_bits; + list *rehashing; /* List of dictionaries in this kvstore that are currently rehashing. */ + int resize_cursor; /* Cron job uses this cursor to gradually resize dictionaries (only used if num_dicts > 1). */ + int allocated_dicts; /* The number of allocated dicts. */ + int non_empty_dicts; /* The number of non-empty dicts. */ + unsigned long long key_count; /* Total number of keys in this kvstore. */ + unsigned long long bucket_count; /* Total number of buckets in this kvstore across dictionaries. */ + unsigned long long *dict_size_index; /* Binary indexed tree (BIT) that describes cumulative key frequencies up until given dict-index. */ + size_t overhead_hashtable_lut; /* The overhead of all dictionaries. */ + size_t overhead_hashtable_rehashing; /* The overhead of dictionaries rehashing. */ +}; + +/* Structure for kvstore iterator that allows iterating across multiple dicts. */ +struct _kvstoreIterator { + kvstore *kvs; + long long didx; + long long next_didx; + dictIterator di; +}; + +/* Structure for kvstore dict iterator that allows iterating the corresponding dict. */ +struct _kvstoreDictIterator { + kvstore *kvs; + long long didx; + dictIterator di; +}; + +/* Dict metadata for database, used for record the position in rehashing list. */ +typedef struct { + listNode *rehashing_node; /* list node in rehashing list */ +} kvstoreDictMetadata; + +/**********************************/ +/*** Helpers **********************/ +/**********************************/ + +/* Get the dictionary pointer based on dict-index. 
*/ +static dict *kvstoreGetDict(kvstore *kvs, int didx) { + return kvs->dicts[didx]; +} + +static dict **kvstoreGetDictRef(kvstore *kvs, int didx) { + return &kvs->dicts[didx]; +} + +static int kvstoreDictIsRehashingPaused(kvstore *kvs, int didx) +{ + dict *d = kvstoreGetDict(kvs, didx); + return d ? dictIsRehashingPaused(d) : 0; +} + +/* Returns total (cumulative) number of keys up until given dict-index (inclusive). + * Time complexity is O(log(kvs->num_dicts)). */ +static unsigned long long cumulativeKeyCountRead(kvstore *kvs, int didx) { + if (kvs->num_dicts == 1) { + assert(didx == 0); + return kvstoreSize(kvs); + } + int idx = didx + 1; + unsigned long long sum = 0; + while (idx > 0) { + sum += kvs->dict_size_index[idx]; + idx -= (idx & -idx); + } + return sum; +} + +static void addDictIndexToCursor(kvstore *kvs, int didx, unsigned long long *cursor) { + if (kvs->num_dicts == 1) + return; + /* didx can be -1 when iteration is over and there are no more dicts to visit. */ + if (didx < 0) + return; + *cursor = (*cursor << kvs->num_dicts_bits) | didx; +} + +static int getAndClearDictIndexFromCursor(kvstore *kvs, unsigned long long *cursor) { + if (kvs->num_dicts == 1) + return 0; + int didx = (int) (*cursor & (kvs->num_dicts-1)); + *cursor = *cursor >> kvs->num_dicts_bits; + return didx; +} + +/* Updates binary index tree (also known as Fenwick tree), increasing key count for a given dict. + * You can read more about this data structure here https://en.wikipedia.org/wiki/Fenwick_tree + * Time complexity is O(log(kvs->num_dicts)). */ +static void cumulativeKeyCountAdd(kvstore *kvs, int didx, long delta) { + kvs->key_count += delta; + + dict *d = kvstoreGetDict(kvs, didx); + size_t dsize = dictSize(d); + int non_empty_dicts_delta = dsize == 1? 1 : dsize == 0? -1 : 0; + kvs->non_empty_dicts += non_empty_dicts_delta; + + /* BIT does not need to be calculated when there's only one dict. 
*/ + if (kvs->num_dicts == 1) + return; + + /* Update the BIT */ + int idx = didx + 1; /* Unlike dict indices, BIT is 1-based, so we need to add 1. */ + while (idx <= kvs->num_dicts) { + if (delta < 0) { + assert(kvs->dict_size_index[idx] >= (unsigned long long)labs(delta)); + } + kvs->dict_size_index[idx] += delta; + idx += (idx & -idx); + } +} + +/* Create the dict if it does not exist and return it. */ +static dict *createDictIfNeeded(kvstore *kvs, int didx) { + dict *d = kvstoreGetDict(kvs, didx); + if (d) return d; + + kvs->dicts[didx] = dictCreate(&kvs->dtype); + kvs->allocated_dicts++; + return kvs->dicts[didx]; +} + +/* Called when the dict will delete entries, the function will check + * KVSTORE_FREE_EMPTY_DICTS to determine whether the empty dict needs + * to be freed. + * + * Note that for rehashing dicts, that is, in the case of safe iterators + * and Scan, we won't delete the dict. We will check whether it needs + * to be deleted when we're releasing the iterator. */ +static void freeDictIfNeeded(kvstore *kvs, int didx) { + if (!(kvs->flags & KVSTORE_FREE_EMPTY_DICTS) || + !kvstoreGetDict(kvs, didx) || + kvstoreDictSize(kvs, didx) != 0 || + kvstoreDictIsRehashingPaused(kvs, didx)) + return; + dictRelease(kvs->dicts[didx]); + kvs->dicts[didx] = NULL; + kvs->allocated_dicts--; +} + +/**********************************/ +/*** dict callbacks ***************/ +/**********************************/ + +/* Adds dictionary to the rehashing list, which allows us + * to quickly find rehash targets during incremental rehashing. + * + * If there are multiple dicts, updates the bucket count for the given dictionary + * in a DB, bucket count incremented with the new ht size during the rehashing phase. + * If there's one dict, bucket count can be retrieved directly from single dict bucket. 
*/ +static void kvstoreDictRehashingStarted(dict *d) { + kvstore *kvs = d->type->userdata; + kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(d); + listAddNodeTail(kvs->rehashing, d); + metadata->rehashing_node = listLast(kvs->rehashing); + + unsigned long long from, to; + dictRehashingInfo(d, &from, &to); + kvs->bucket_count += to; /* Started rehashing (Add the new ht size) */ + kvs->overhead_hashtable_lut += to; + kvs->overhead_hashtable_rehashing += from; +} + +/* Remove dictionary from the rehashing list. + * + * Updates the bucket count for the given dictionary in a DB. It removes + * the old ht size of the dictionary from the total sum of buckets for a DB. */ +static void kvstoreDictRehashingCompleted(dict *d) { + kvstore *kvs = d->type->userdata; + kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(d); + if (metadata->rehashing_node) { + listDelNode(kvs->rehashing, metadata->rehashing_node); + metadata->rehashing_node = NULL; + } + + unsigned long long from, to; + dictRehashingInfo(d, &from, &to); + kvs->bucket_count -= from; /* Finished rehashing (Remove the old ht size) */ + kvs->overhead_hashtable_lut -= from; + kvs->overhead_hashtable_rehashing -= from; +} + +/* Returns the size of the DB dict metadata in bytes. */ +static size_t kvstoreDictMetadataSize(dict *d) { + UNUSED(d); + return sizeof(kvstoreDictMetadata); +} + +/**********************************/ +/*** API **************************/ +/**********************************/ + +/* Create an array of dictionaries + * num_dicts_bits is the log2 of the amount of dictionaries needed (e.g. 0 for 1 dict, + * 3 for 8 dicts, etc.) 
*/ +kvstore *kvstoreCreate(dictType *type, int num_dicts_bits, int flags) { + /* We can't support more than 2^16 dicts because we want to save 48 bits + * for the dict cursor, see kvstoreScan */ + assert(num_dicts_bits <= 16); + + kvstore *kvs = zcalloc(sizeof(*kvs)); + memcpy(&kvs->dtype, type, sizeof(kvs->dtype)); + kvs->flags = flags; + + /* kvstore must be the one to set these callbacks, so we make sure the + * caller didn't do it */ + assert(!type->userdata); + assert(!type->dictMetadataBytes); + assert(!type->rehashingStarted); + assert(!type->rehashingCompleted); + kvs->dtype.userdata = kvs; + kvs->dtype.dictMetadataBytes = kvstoreDictMetadataSize; + kvs->dtype.rehashingStarted = kvstoreDictRehashingStarted; + kvs->dtype.rehashingCompleted = kvstoreDictRehashingCompleted; + + kvs->num_dicts_bits = num_dicts_bits; + kvs->num_dicts = 1 << kvs->num_dicts_bits; + kvs->dicts = zcalloc(sizeof(dict*) * kvs->num_dicts); + if (!(kvs->flags & KVSTORE_ALLOCATE_DICTS_ON_DEMAND)) { + for (int i = 0; i < kvs->num_dicts; i++) + createDictIfNeeded(kvs, i); + } + + kvs->rehashing = listCreate(); + kvs->key_count = 0; + kvs->non_empty_dicts = 0; + kvs->resize_cursor = 0; + kvs->dict_size_index = kvs->num_dicts > 1? 
zcalloc(sizeof(unsigned long long) * (kvs->num_dicts + 1)) : NULL; + kvs->bucket_count = 0; + kvs->overhead_hashtable_lut = 0; + kvs->overhead_hashtable_rehashing = 0; + + return kvs; +} + +void kvstoreEmpty(kvstore *kvs, void(callback)(dict*)) { + for (int didx = 0; didx < kvs->num_dicts; didx++) { + dict *d = kvstoreGetDict(kvs, didx); + if (!d) + continue; + kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(d); + if (metadata->rehashing_node) + metadata->rehashing_node = NULL; + dictEmpty(d, callback); + freeDictIfNeeded(kvs, didx); + } + + listEmpty(kvs->rehashing); + + kvs->key_count = 0; + kvs->non_empty_dicts = 0; + kvs->resize_cursor = 0; + kvs->bucket_count = 0; + if (kvs->dict_size_index) + memset(kvs->dict_size_index, 0, sizeof(unsigned long long) * (kvs->num_dicts + 1)); + kvs->overhead_hashtable_lut = 0; + kvs->overhead_hashtable_rehashing = 0; +} + +void kvstoreRelease(kvstore *kvs) { + for (int didx = 0; didx < kvs->num_dicts; didx++) { + dict *d = kvstoreGetDict(kvs, didx); + if (!d) + continue; + kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(d); + if (metadata->rehashing_node) + metadata->rehashing_node = NULL; + dictRelease(d); + } + zfree(kvs->dicts); + + listRelease(kvs->rehashing); + if (kvs->dict_size_index) + zfree(kvs->dict_size_index); + + zfree(kvs); +} + +unsigned long long int kvstoreSize(kvstore *kvs) { + if (kvs->num_dicts != 1) { + return kvs->key_count; + } else { + return kvs->dicts[0]? dictSize(kvs->dicts[0]) : 0; + } +} + +/* This method provides the cumulative sum of all the dictionary buckets + * across dictionaries in a database. */ +unsigned long kvstoreBuckets(kvstore *kvs) { + if (kvs->num_dicts != 1) { + return kvs->bucket_count; + } else { + return kvs->dicts[0]? 
dictBuckets(kvs->dicts[0]) : 0; + } +} + +size_t kvstoreMemUsage(kvstore *kvs) { + size_t mem = sizeof(*kvs); + + unsigned long long keys_count = kvstoreSize(kvs); + mem += keys_count * dictEntryMemUsage() + + kvstoreBuckets(kvs) * sizeof(dictEntry*) + + kvs->allocated_dicts * (sizeof(dict) + kvstoreDictMetadataSize(NULL)); + + /* Values are dict* shared with kvs->dicts */ + mem += listLength(kvs->rehashing) * sizeof(listNode); + + if (kvs->dict_size_index) + mem += sizeof(unsigned long long) * (kvs->num_dicts + 1); + + return mem; +} + +/* + * This method is used to iterate over the elements of the entire kvstore specifically across dicts. + * It's a three pronged approach. + * + * 1. It uses the provided cursor `cursor` to retrieve the dict index from it. + * 2. If the dictionary is in a valid state checked through the provided callback `dictScanValidFunction`, + * it performs a dictScan over the appropriate `keyType` dictionary of `db`. + * 3. If the dict is entirely scanned i.e. the cursor has reached 0, the next non empty dict is discovered. + * The dict information is embedded into the cursor and returned. + * + * To restrict the scan to a single dict, pass a valid dict index as + * 'onlydidx', otherwise pass -1. + */ +unsigned long long kvstoreScan(kvstore *kvs, unsigned long long cursor, + int onlydidx, dictScanFunction *scan_cb, + kvstoreScanShouldSkipDict *skip_cb, + void *privdata) +{ + unsigned long long _cursor = 0; + /* During dictionary traversal, 48 upper bits in the cursor are used for positioning in the HT. + * Following lower bits are used for the dict index number, ranging from 0 to 2^num_dicts_bits-1. + * Dict index is always 0 at the start of iteration and can be incremented only if there are + * multiple dicts. */ + int didx = getAndClearDictIndexFromCursor(kvs, &cursor); + if (onlydidx >= 0) { + if (didx < onlydidx) { + /* Fast-forward to onlydidx. 
*/ + assert(onlydidx < kvs->num_dicts); + didx = onlydidx; + cursor = 0; + } else if (didx > onlydidx) { + /* The cursor is already past onlydidx. */ + return 0; + } + } + + dict *d = kvstoreGetDict(kvs, didx); + + int skip = !d || (skip_cb && skip_cb(d)); + if (!skip) { + _cursor = dictScan(d, cursor, scan_cb, privdata); + /* In dictScan, scan_cb may delete entries (e.g., in active expire case). */ + freeDictIfNeeded(kvs, didx); + } + /* scanning done for the current dictionary or if the scanning wasn't possible, move to the next dict index. */ + if (_cursor == 0 || skip) { + if (onlydidx >= 0) + return 0; + didx = kvstoreGetNextNonEmptyDictIndex(kvs, didx); + } + if (didx == -1) { + return 0; + } + addDictIndexToCursor(kvs, didx, &_cursor); + return _cursor; +} + +/* + * This functions increases size of kvstore to match desired number. + * It resizes all individual dictionaries, unless skip_cb indicates otherwise. + * + * Based on the parameter `try_expand`, appropriate dict expand API is invoked. + * if try_expand is set to 1, `dictTryExpand` is used else `dictExpand`. + * The return code is either `DICT_OK`/`DICT_ERR` for both the API(s). + * `DICT_OK` response is for successful expansion. However, `DICT_ERR` response signifies failure in allocation in + * `dictTryExpand` call and in case of `dictExpand` call it signifies no expansion was performed. + */ +int kvstoreExpand(kvstore *kvs, uint64_t newsize, int try_expand, kvstoreExpandShouldSkipDictIndex *skip_cb) { + for (int i = 0; i < kvs->num_dicts; i++) { + dict *d = kvstoreGetDict(kvs, i); + if (!d || (skip_cb && skip_cb(i))) + continue; + int result = try_expand ? dictTryExpand(d, newsize) : dictExpand(d, newsize); + if (try_expand && result == DICT_ERR) + return 0; + } + + return 1; +} + +/* Returns fair random dict index, probability of each dict being returned is proportional to the number of elements that dictionary holds. 
+ * This function guarantees that it returns a dict-index of a non-empty dict, unless the entire kvstore is empty. + * Time complexity of this function is O(log(kvs->num_dicts)). */ +int kvstoreGetFairRandomDictIndex(kvstore *kvs) { + unsigned long target = kvstoreSize(kvs) ? (randomULong() % kvstoreSize(kvs)) + 1 : 0; + return kvstoreFindDictIndexByKeyIndex(kvs, target); +} + +void kvstoreGetStats(kvstore *kvs, char *buf, size_t bufsize, int full) { + buf[0] = '\0'; + + size_t l; + char *orig_buf = buf; + size_t orig_bufsize = bufsize; + dictStats *mainHtStats = NULL; + dictStats *rehashHtStats = NULL; + dict *d; + kvstoreIterator *kvs_it = kvstoreIteratorInit(kvs); + while ((d = kvstoreIteratorNextDict(kvs_it))) { + dictStats *stats = dictGetStatsHt(d, 0, full); + if (!mainHtStats) { + mainHtStats = stats; + } else { + dictCombineStats(stats, mainHtStats); + dictFreeStats(stats); + } + if (dictIsRehashing(d)) { + stats = dictGetStatsHt(d, 1, full); + if (!rehashHtStats) { + rehashHtStats = stats; + } else { + dictCombineStats(stats, rehashHtStats); + dictFreeStats(stats); + } + } + } + kvstoreIteratorRelease(kvs_it); + + if (mainHtStats && bufsize > 0) { + l = dictGetStatsMsg(buf, bufsize, mainHtStats, full); + dictFreeStats(mainHtStats); + buf += l; + bufsize -= l; + } + + if (rehashHtStats && bufsize > 0) { + l = dictGetStatsMsg(buf, bufsize, rehashHtStats, full); + dictFreeStats(rehashHtStats); + buf += l; + bufsize -= l; + } + /* Make sure there is a NULL term at the end. */ + if (orig_bufsize) orig_buf[orig_bufsize - 1] = '\0'; +} + +/* Finds a dict containing target element in a key space ordered by dict index. + * Consider this example. Dictionaries are represented by brackets and keys by dots: + * #0 #1 #2 #3 #4 + * [..][....][...][.......][.] + * ^ + * target + * + * In this case dict #3 contains key that we are trying to find. + * + * The return value is 0 based dict-index, and the range of the target is [1..kvstoreSize], kvstoreSize inclusive. 
+ * + * To find the dict, we start with the root node of the binary index tree and search through its children + * from the highest index (2^num_dicts_bits in our case) to the lowest index. At each node, we check if the target + * value is greater than the node's value. If it is, we remove the node's value from the target and recursively + * search for the new target using the current node as the parent. + * Time complexity of this function is O(log(kvs->num_dicts)) + */ +int kvstoreFindDictIndexByKeyIndex(kvstore *kvs, unsigned long target) { + if (kvs->num_dicts == 1 || kvstoreSize(kvs) == 0) + return 0; + assert(target <= kvstoreSize(kvs)); + + int result = 0, bit_mask = 1 << kvs->num_dicts_bits; + for (int i = bit_mask; i != 0; i >>= 1) { + int current = result + i; + /* When the target index is greater than 'current' node value the we will update + * the target and search in the 'current' node tree. */ + if (target > kvs->dict_size_index[current]) { + target -= kvs->dict_size_index[current]; + result = current; + } + } + /* Adjust the result to get the correct dict: + * 1. result += 1; + * After the calculations, the index of target in dict_size_index should be the next one, + * so we should add 1. + * 2. result -= 1; + * Unlike BIT(dict_size_index is 1-based), dict indices are 0-based, so we need to subtract 1. + * As the addition and subtraction cancel each other out, we can simply return the result. */ + return result; +} + +/* Wrapper for kvstoreFindDictIndexByKeyIndex to get the first non-empty dict index in the kvstore. */ +int kvstoreGetFirstNonEmptyDictIndex(kvstore *kvs) { + return kvstoreFindDictIndexByKeyIndex(kvs, 1); +} + +/* Returns next non-empty dict index strictly after given one, or -1 if provided didx is the last one. 
*/ +int kvstoreGetNextNonEmptyDictIndex(kvstore *kvs, int didx) { + if (kvs->num_dicts == 1) { + assert(didx == 0); + return -1; + } + unsigned long long next_key = cumulativeKeyCountRead(kvs, didx) + 1; + return next_key <= kvstoreSize(kvs) ? kvstoreFindDictIndexByKeyIndex(kvs, next_key) : -1; +} + +int kvstoreNumNonEmptyDicts(kvstore *kvs) { + return kvs->non_empty_dicts; +} + +int kvstoreNumAllocatedDicts(kvstore *kvs) { + return kvs->allocated_dicts; +} + +int kvstoreNumDicts(kvstore *kvs) { + return kvs->num_dicts; +} + +/* Returns kvstore iterator that can be used to iterate through sub-dictionaries. + * + * The caller should free the resulting kvs_it with kvstoreIteratorRelease. */ +kvstoreIterator *kvstoreIteratorInit(kvstore *kvs) { + kvstoreIterator *kvs_it = zmalloc(sizeof(*kvs_it)); + kvs_it->kvs = kvs; + kvs_it->didx = -1; + kvs_it->next_didx = kvstoreGetFirstNonEmptyDictIndex(kvs_it->kvs); /* Finds first non-empty dict index. */ + dictInitSafeIterator(&kvs_it->di, NULL); + return kvs_it; +} + +/* Free the kvs_it returned by kvstoreIteratorInit. */ +void kvstoreIteratorRelease(kvstoreIterator *kvs_it) { + dictIterator *iter = &kvs_it->di; + dictResetIterator(iter); + /* In the safe iterator context, we may delete entries. */ + freeDictIfNeeded(kvs_it->kvs, kvs_it->didx); + zfree(kvs_it); +} + + +/* Returns next dictionary from the iterator, or NULL if iteration is complete. + * + * - Takes care to reset the iter of the previous dict before moved to the next dict. + */ +dict *kvstoreIteratorNextDict(kvstoreIterator *kvs_it) { + if (kvs_it->next_didx == -1) + return NULL; + + /* The dict may be deleted during the iteration process, so here need to check for NULL. */ + if (kvs_it->didx != -1 && kvstoreGetDict(kvs_it->kvs, kvs_it->didx)) { + /* Before we move to the next dict, reset the iter of the previous dict. */ + dictIterator *iter = &kvs_it->di; + dictResetIterator(iter); + /* In the safe iterator context, we may delete entries. 
*/ + freeDictIfNeeded(kvs_it->kvs, kvs_it->didx); + } + + kvs_it->didx = kvs_it->next_didx; + kvs_it->next_didx = kvstoreGetNextNonEmptyDictIndex(kvs_it->kvs, kvs_it->didx); + return kvs_it->kvs->dicts[kvs_it->didx]; +} + +int kvstoreIteratorGetCurrentDictIndex(kvstoreIterator *kvs_it) { + assert(kvs_it->didx >= 0 && kvs_it->didx < kvs_it->kvs->num_dicts); + return kvs_it->didx; +} + +/* Returns next entry. */ +dictEntry *kvstoreIteratorNext(kvstoreIterator *kvs_it) { + dictEntry *de = kvs_it->di.d ? dictNext(&kvs_it->di) : NULL; + if (!de) { /* No current dict or reached the end of the dictionary. */ + + /* Before we move to the next dict, function kvstoreIteratorNextDict() + * reset the iter of the previous dict & freeDictIfNeeded(). */ + dict *d = kvstoreIteratorNextDict(kvs_it); + + if (!d) + return NULL; + + dictInitSafeIterator(&kvs_it->di, d); + de = dictNext(&kvs_it->di); + } + return de; +} + +/* This method traverses through kvstore dictionaries and triggers a resize. + * It first tries to shrink if needed, and if it isn't, it tries to expand. */ +void kvstoreTryResizeDicts(kvstore *kvs, int limit) { + if (limit > kvs->num_dicts) + limit = kvs->num_dicts; + + for (int i = 0; i < limit; i++) { + int didx = kvs->resize_cursor; + dict *d = kvstoreGetDict(kvs, didx); + if (d && dictShrinkIfNeeded(d) == DICT_ERR) { + dictExpandIfNeeded(d); + } + kvs->resize_cursor = (didx + 1) % kvs->num_dicts; + } +} + +/* Our hash table implementation performs rehashing incrementally while + * we write/read from the hash table. Still if the server is idle, the hash + * table will use two tables for a long time. So we try to use threshold_us + * of CPU time at every call of this function to perform some rehashing. + * + * The function returns the amount of microsecs spent if some rehashing was + * performed, otherwise 0 is returned. 
*/ +uint64_t kvstoreIncrementallyRehash(kvstore *kvs, uint64_t threshold_us) { + if (listLength(kvs->rehashing) == 0) + return 0; + + /* Our goal is to rehash as many dictionaries as we can before reaching threshold_us, + * after each dictionary completes rehashing, it removes itself from the list. */ + listNode *node; + monotime timer; + uint64_t elapsed_us = 0; + elapsedStart(&timer); + while ((node = listFirst(kvs->rehashing))) { + dictRehashMicroseconds(listNodeValue(node), threshold_us - elapsed_us); + + elapsed_us = elapsedUs(timer); + if (elapsed_us >= threshold_us) { + break; /* Reached the time limit. */ + } + } + return elapsed_us; +} + +size_t kvstoreOverheadHashtableLut(kvstore *kvs) { + return kvs->overhead_hashtable_lut * sizeof(dictEntry *); +} + +size_t kvstoreOverheadHashtableRehashing(kvstore *kvs) { + return kvs->overhead_hashtable_rehashing * sizeof(dictEntry *); +} + +unsigned long kvstoreDictRehashingCount(kvstore *kvs) { + return listLength(kvs->rehashing); +} + +unsigned long kvstoreDictSize(kvstore *kvs, int didx) +{ + dict *d = kvstoreGetDict(kvs, didx); + if (!d) + return 0; + return dictSize(d); +} + +kvstoreDictIterator *kvstoreGetDictIterator(kvstore *kvs, int didx) +{ + kvstoreDictIterator *kvs_di = zmalloc(sizeof(*kvs_di)); + kvs_di->kvs = kvs; + kvs_di->didx = didx; + dictInitIterator(&kvs_di->di, kvstoreGetDict(kvs, didx)); + return kvs_di; +} + +kvstoreDictIterator *kvstoreGetDictSafeIterator(kvstore *kvs, int didx) +{ + kvstoreDictIterator *kvs_di = zmalloc(sizeof(*kvs_di)); + kvs_di->kvs = kvs; + kvs_di->didx = didx; + dictInitSafeIterator(&kvs_di->di, kvstoreGetDict(kvs, didx)); + return kvs_di; +} + +/* Free the kvs_di returned by kvstoreGetDictIterator and kvstoreGetDictSafeIterator. */ +void kvstoreReleaseDictIterator(kvstoreDictIterator *kvs_di) +{ + /* The dict may be deleted during the iteration process, so here need to check for NULL. 
*/ + if (kvstoreGetDict(kvs_di->kvs, kvs_di->didx)) { + dictResetIterator(&kvs_di->di); + /* In the safe iterator context, we may delete entries. */ + freeDictIfNeeded(kvs_di->kvs, kvs_di->didx); + } + + zfree(kvs_di); +} + +/* Get the next element of the dict through kvstoreDictIterator and dictNext. */ +dictEntry *kvstoreDictIteratorNext(kvstoreDictIterator *kvs_di) +{ + /* The dict may be deleted during the iteration process, so here need to check for NULL. */ + dict *d = kvstoreGetDict(kvs_di->kvs, kvs_di->didx); + if (!d) return NULL; + + return dictNext(&kvs_di->di); +} + +dictEntry *kvstoreDictGetRandomKey(kvstore *kvs, int didx) +{ + dict *d = kvstoreGetDict(kvs, didx); + if (!d) + return NULL; + return dictGetRandomKey(d); +} + +dictEntry *kvstoreDictGetFairRandomKey(kvstore *kvs, int didx) +{ + dict *d = kvstoreGetDict(kvs, didx); + if (!d) + return NULL; + return dictGetFairRandomKey(d); +} + +dictEntry *kvstoreDictFindEntryByPtrAndHash(kvstore *kvs, int didx, const void *oldptr, uint64_t hash) +{ + dict *d = kvstoreGetDict(kvs, didx); + if (!d) + return NULL; + return dictFindEntryByPtrAndHash(d, oldptr, hash); +} + +unsigned int kvstoreDictGetSomeKeys(kvstore *kvs, int didx, dictEntry **des, unsigned int count) +{ + dict *d = kvstoreGetDict(kvs, didx); + if (!d) + return 0; + return dictGetSomeKeys(d, des, count); +} + +int kvstoreDictExpand(kvstore *kvs, int didx, unsigned long size) +{ + dict *d = kvstoreGetDict(kvs, didx); + if (!d) + return DICT_ERR; + return dictExpand(d, size); +} + +unsigned long kvstoreDictScanDefrag(kvstore *kvs, int didx, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata) +{ + dict *d = kvstoreGetDict(kvs, didx); + if (!d) + return 0; + return dictScanDefrag(d, v, fn, defragfns, privdata); +} + +/* Unlike kvstoreDictScanDefrag(), this method doesn't defrag the data(keys and values) + * within dict, it only reallocates the memory used by the dict structure itself using + * the provided 
allocation function. This feature was added for the active defrag feature. + * + * The 'defragfn' callback is called with a reference to the dict + * that callback can reallocate. */ +void kvstoreDictLUTDefrag(kvstore *kvs, kvstoreDictLUTDefragFunction *defragfn) { + for (int didx = 0; didx < kvs->num_dicts; didx++) { + dict **d = kvstoreGetDictRef(kvs, didx), *newd; + if (!*d) + continue; + if ((newd = defragfn(*d))) { + *d = newd; + + /* After defragmenting the dict, update its corresponding + * rehashing node in the kvstore's rehashing list. */ + kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(*d); + if (metadata->rehashing_node) + metadata->rehashing_node->value = *d; + } + } +} + +uint64_t kvstoreGetHash(kvstore *kvs, const void *key) +{ + return kvs->dtype.hashFunction(key); +} + +void *kvstoreDictFetchValue(kvstore *kvs, int didx, const void *key) +{ + dict *d = kvstoreGetDict(kvs, didx); + if (!d) + return NULL; + return dictFetchValue(d, key); +} + +dictEntry *kvstoreDictFind(kvstore *kvs, int didx, void *key) { + dict *d = kvstoreGetDict(kvs, didx); + if (!d) + return NULL; + return dictFind(d, key); +} + +dictEntry *kvstoreDictAddRaw(kvstore *kvs, int didx, void *key, dictEntry **existing) { + dict *d = createDictIfNeeded(kvs, didx); + dictEntry *ret = dictAddRaw(d, key, existing); + if (ret) + cumulativeKeyCountAdd(kvs, didx, 1); + return ret; +} + +void kvstoreDictSetKey(kvstore *kvs, int didx, dictEntry* de, void *key) { + dict *d = kvstoreGetDict(kvs, didx); + dictSetKey(d, de, key); +} + +void kvstoreDictSetVal(kvstore *kvs, int didx, dictEntry *de, void *val) { + dict *d = kvstoreGetDict(kvs, didx); + dictSetVal(d, de, val); +} + +dictEntry *kvstoreDictTwoPhaseUnlinkFind(kvstore *kvs, int didx, const void *key, dictEntry ***plink, int *table_index) { + dict *d = kvstoreGetDict(kvs, didx); + if (!d) + return NULL; + return dictTwoPhaseUnlinkFind(kvstoreGetDict(kvs, didx), key, plink, table_index); +} + +void 
kvstoreDictTwoPhaseUnlinkFree(kvstore *kvs, int didx, dictEntry *he, dictEntry **plink, int table_index) { + dict *d = kvstoreGetDict(kvs, didx); + dictTwoPhaseUnlinkFree(d, he, plink, table_index); + cumulativeKeyCountAdd(kvs, didx, -1); + freeDictIfNeeded(kvs, didx); +} + +int kvstoreDictDelete(kvstore *kvs, int didx, const void *key) { + dict *d = kvstoreGetDict(kvs, didx); + if (!d) + return DICT_ERR; + int ret = dictDelete(d, key); + if (ret == DICT_OK) { + cumulativeKeyCountAdd(kvs, didx, -1); + freeDictIfNeeded(kvs, didx); + } + return ret; +} + +#ifdef REDIS_TEST +#include +#include "testhelp.h" + +#define TEST(name) printf("test — %s\n", name); + +uint64_t hashTestCallback(const void *key) { + return dictGenHashFunction((unsigned char*)key, strlen((char*)key)); +} + +void freeTestCallback(dict *d, void *val) { + UNUSED(d); + zfree(val); +} + +void *defragAllocTest(void *ptr) { + size_t size = zmalloc_usable_size(ptr); + void *newptr = zmalloc(size); + memcpy(newptr, ptr, size); + zfree(ptr); + return newptr; +} + +dict *defragLUTTestCallback(dict *d) { + /* handle the dict struct */ + d = defragAllocTest(d); + /* handle the first hash table */ + d->ht_table[0] = defragAllocTest(d->ht_table[0]); + /* handle the second hash table */ + if (d->ht_table[1]) + d->ht_table[1] = defragAllocTest(d->ht_table[1]); + return d; +} + +dictType KvstoreDictTestType = { + hashTestCallback, + NULL, + NULL, + NULL, + freeTestCallback, + NULL, + NULL +}; + +char *stringFromInt(int value) { + char buf[32]; + int len; + char *s; + + len = snprintf(buf, sizeof(buf), "%d",value); + s = zmalloc(len+1); + memcpy(s, buf, len); + s[len] = '\0'; + return s; +} + +/* ./redis-server test kvstore */ +int kvstoreTest(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + int i; + void *key; + dictEntry *de; + kvstoreIterator *kvs_it; + kvstoreDictIterator *kvs_di; + + int didx = 0; + int curr_slot = 0; + kvstore *kvs1 = kvstoreCreate(&KvstoreDictTestType, 
0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND); + kvstore *kvs2 = kvstoreCreate(&KvstoreDictTestType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND | KVSTORE_FREE_EMPTY_DICTS); + + TEST("Add 16 keys") { + for (i = 0; i < 16; i++) { + de = kvstoreDictAddRaw(kvs1, didx, stringFromInt(i), NULL); + assert(de != NULL); + de = kvstoreDictAddRaw(kvs2, didx, stringFromInt(i), NULL); + assert(de != NULL); + } + assert(kvstoreDictSize(kvs1, didx) == 16); + assert(kvstoreSize(kvs1) == 16); + assert(kvstoreDictSize(kvs2, didx) == 16); + assert(kvstoreSize(kvs2) == 16); + } + + TEST("kvstoreIterator case 1: removing all keys does not delete the empty dict") { + kvs_it = kvstoreIteratorInit(kvs1); + while((de = kvstoreIteratorNext(kvs_it)) != NULL) { + curr_slot = kvstoreIteratorGetCurrentDictIndex(kvs_it); + key = dictGetKey(de); + assert(kvstoreDictDelete(kvs1, curr_slot, key) == DICT_OK); + } + kvstoreIteratorRelease(kvs_it); + + dict *d = kvstoreGetDict(kvs1, didx); + assert(d != NULL); + assert(kvstoreDictSize(kvs1, didx) == 0); + assert(kvstoreSize(kvs1) == 0); + } + + TEST("kvstoreIterator case 2: removing all keys will delete the empty dict") { + kvs_it = kvstoreIteratorInit(kvs2); + while((de = kvstoreIteratorNext(kvs_it)) != NULL) { + curr_slot = kvstoreIteratorGetCurrentDictIndex(kvs_it); + key = dictGetKey(de); + assert(kvstoreDictDelete(kvs2, curr_slot, key) == DICT_OK); + } + kvstoreIteratorRelease(kvs_it); + + /* Make sure the dict was removed from the rehashing list. 
*/ + while (kvstoreIncrementallyRehash(kvs2, 1000)) {} + + dict *d = kvstoreGetDict(kvs2, didx); + assert(d == NULL); + assert(kvstoreDictSize(kvs2, didx) == 0); + assert(kvstoreSize(kvs2) == 0); + } + + TEST("Add 16 keys again") { + for (i = 0; i < 16; i++) { + de = kvstoreDictAddRaw(kvs1, didx, stringFromInt(i), NULL); + assert(de != NULL); + de = kvstoreDictAddRaw(kvs2, didx, stringFromInt(i), NULL); + assert(de != NULL); + } + assert(kvstoreDictSize(kvs1, didx) == 16); + assert(kvstoreSize(kvs1) == 16); + assert(kvstoreDictSize(kvs2, didx) == 16); + assert(kvstoreSize(kvs2) == 16); + } + + TEST("kvstoreDictIterator case 1: removing all keys does not delete the empty dict") { + kvs_di = kvstoreGetDictSafeIterator(kvs1, didx); + while((de = kvstoreDictIteratorNext(kvs_di)) != NULL) { + key = dictGetKey(de); + assert(kvstoreDictDelete(kvs1, didx, key) == DICT_OK); + } + kvstoreReleaseDictIterator(kvs_di); + + dict *d = kvstoreGetDict(kvs1, didx); + assert(d != NULL); + assert(kvstoreDictSize(kvs1, didx) == 0); + assert(kvstoreSize(kvs1) == 0); + } + + TEST("kvstoreDictIterator case 2: removing all keys will delete the empty dict") { + kvs_di = kvstoreGetDictSafeIterator(kvs2, didx); + while((de = kvstoreDictIteratorNext(kvs_di)) != NULL) { + key = dictGetKey(de); + assert(kvstoreDictDelete(kvs2, didx, key) == DICT_OK); + } + kvstoreReleaseDictIterator(kvs_di); + + dict *d = kvstoreGetDict(kvs2, didx); + assert(d == NULL); + assert(kvstoreDictSize(kvs2, didx) == 0); + assert(kvstoreSize(kvs2) == 0); + } + + TEST("Verify that a rehashing dict's node in the rehashing list is correctly updated after defragmentation") { + kvstore *kvs = kvstoreCreate(&KvstoreDictTestType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND); + for (i = 0; i < 256; i++) { + de = kvstoreDictAddRaw(kvs, 0, stringFromInt(i), NULL); + if (listLength(kvs->rehashing)) break; + } + assert(listLength(kvs->rehashing)); + kvstoreDictLUTDefrag(kvs, defragLUTTestCallback); + while (kvstoreIncrementallyRehash(kvs, 
1000)) {} + kvstoreRelease(kvs); + } + + kvstoreRelease(kvs1); + kvstoreRelease(kvs2); + return 0; +} +#endif diff --git a/src/kvstore.h b/src/kvstore.h new file mode 100644 index 00000000000..bce45fe4c1b --- /dev/null +++ b/src/kvstore.h @@ -0,0 +1,79 @@ +#ifndef DICTARRAY_H_ +#define DICTARRAY_H_ + +#include "dict.h" +#include "adlist.h" + +typedef struct _kvstore kvstore; +typedef struct _kvstoreIterator kvstoreIterator; +typedef struct _kvstoreDictIterator kvstoreDictIterator; + +typedef int (kvstoreScanShouldSkipDict)(dict *d); +typedef int (kvstoreExpandShouldSkipDictIndex)(int didx); + +#define KVSTORE_ALLOCATE_DICTS_ON_DEMAND (1<<0) +#define KVSTORE_FREE_EMPTY_DICTS (1<<1) +kvstore *kvstoreCreate(dictType *type, int num_dicts_bits, int flags); +void kvstoreEmpty(kvstore *kvs, void(callback)(dict*)); +void kvstoreRelease(kvstore *kvs); +unsigned long long kvstoreSize(kvstore *kvs); +unsigned long kvstoreBuckets(kvstore *kvs); +size_t kvstoreMemUsage(kvstore *kvs); +unsigned long long kvstoreScan(kvstore *kvs, unsigned long long cursor, + int onlydidx, dictScanFunction *scan_cb, + kvstoreScanShouldSkipDict *skip_cb, + void *privdata); +int kvstoreExpand(kvstore *kvs, uint64_t newsize, int try_expand, kvstoreExpandShouldSkipDictIndex *skip_cb); +int kvstoreGetFairRandomDictIndex(kvstore *kvs); +void kvstoreGetStats(kvstore *kvs, char *buf, size_t bufsize, int full); + +int kvstoreFindDictIndexByKeyIndex(kvstore *kvs, unsigned long target); +int kvstoreGetFirstNonEmptyDictIndex(kvstore *kvs); +int kvstoreGetNextNonEmptyDictIndex(kvstore *kvs, int didx); +int kvstoreNumNonEmptyDicts(kvstore *kvs); +int kvstoreNumAllocatedDicts(kvstore *kvs); +int kvstoreNumDicts(kvstore *kvs); +uint64_t kvstoreGetHash(kvstore *kvs, const void *key); + +/* kvstore iterator specific functions */ +kvstoreIterator *kvstoreIteratorInit(kvstore *kvs); +void kvstoreIteratorRelease(kvstoreIterator *kvs_it); +dict *kvstoreIteratorNextDict(kvstoreIterator *kvs_it); +int 
kvstoreIteratorGetCurrentDictIndex(kvstoreIterator *kvs_it); +dictEntry *kvstoreIteratorNext(kvstoreIterator *kvs_it); + +/* Rehashing */ +void kvstoreTryResizeDicts(kvstore *kvs, int limit); +uint64_t kvstoreIncrementallyRehash(kvstore *kvs, uint64_t threshold_us); +size_t kvstoreOverheadHashtableLut(kvstore *kvs); +size_t kvstoreOverheadHashtableRehashing(kvstore *kvs); +unsigned long kvstoreDictRehashingCount(kvstore *kvs); + +/* Specific dict access by dict-index */ +unsigned long kvstoreDictSize(kvstore *kvs, int didx); +kvstoreDictIterator *kvstoreGetDictIterator(kvstore *kvs, int didx); +kvstoreDictIterator *kvstoreGetDictSafeIterator(kvstore *kvs, int didx); +void kvstoreReleaseDictIterator(kvstoreDictIterator *kvs_id); +dictEntry *kvstoreDictIteratorNext(kvstoreDictIterator *kvs_di); +dictEntry *kvstoreDictGetRandomKey(kvstore *kvs, int didx); +dictEntry *kvstoreDictGetFairRandomKey(kvstore *kvs, int didx); +dictEntry *kvstoreDictFindEntryByPtrAndHash(kvstore *kvs, int didx, const void *oldptr, uint64_t hash); +unsigned int kvstoreDictGetSomeKeys(kvstore *kvs, int didx, dictEntry **des, unsigned int count); +int kvstoreDictExpand(kvstore *kvs, int didx, unsigned long size); +unsigned long kvstoreDictScanDefrag(kvstore *kvs, int didx, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata); +typedef dict *(kvstoreDictLUTDefragFunction)(dict *d); +void kvstoreDictLUTDefrag(kvstore *kvs, kvstoreDictLUTDefragFunction *defragfn); +void *kvstoreDictFetchValue(kvstore *kvs, int didx, const void *key); +dictEntry *kvstoreDictFind(kvstore *kvs, int didx, void *key); +dictEntry *kvstoreDictAddRaw(kvstore *kvs, int didx, void *key, dictEntry **existing); +void kvstoreDictSetKey(kvstore *kvs, int didx, dictEntry* de, void *key); +void kvstoreDictSetVal(kvstore *kvs, int didx, dictEntry *de, void *val); +dictEntry *kvstoreDictTwoPhaseUnlinkFind(kvstore *kvs, int didx, const void *key, dictEntry ***plink, int *table_index); +void 
kvstoreDictTwoPhaseUnlinkFree(kvstore *kvs, int didx, dictEntry *he, dictEntry **plink, int table_index); +int kvstoreDictDelete(kvstore *kvs, int didx, const void *key); + +#ifdef REDIS_TEST +int kvstoreTest(int argc, char *argv[], int flags); +#endif + +#endif /* DICTARRAY_H_ */ diff --git a/src/latency.c b/src/latency.c index d46890e826f..db4c9044dd2 100644 --- a/src/latency.c +++ b/src/latency.c @@ -5,32 +5,11 @@ * * ---------------------------------------------------------------------------- * - * Copyright (c) 2014, Salvatore Sanfilippo + * Copyright (c) 2014-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -279,7 +258,7 @@ sds createLatencyReport(void) { /* Potentially commands. */ if (!strcasecmp(event,"command")) { - if (server.slowlog_log_slower_than < 0) { + if (server.slowlog_log_slower_than < 0 || server.slowlog_max_len == 0) { advise_slowlog_enabled = 1; advices++; } else if (server.slowlog_log_slower_than/1000 > diff --git a/src/latency.h b/src/latency.h index 13503d5c031..1951957c0af 100644 --- a/src/latency.h +++ b/src/latency.h @@ -3,32 +3,11 @@ * * ---------------------------------------------------------------------------- * - * Copyright (c) 2014, Salvatore Sanfilippo + * Copyright (c) 2014-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef __LATENCY_H diff --git a/src/lazyfree.c b/src/lazyfree.c index 8ac55f77750..2b98f9a06fc 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -2,6 +2,8 @@ #include "bio.h" #include "atomicvar.h" #include "functions.h" +#include "cluster.h" +#include "ebuckets.h" static redisAtomic size_t lazyfree_objects = 0; static redisAtomic size_t lazyfreed_objects = 0; @@ -19,14 +21,23 @@ void lazyfreeFreeObject(void *args[]) { * database which was substituted with a fresh one in the main thread * when the database was logically deleted. 
*/ void lazyfreeFreeDatabase(void *args[]) { - dict *ht1 = (dict *) args[0]; - dict *ht2 = (dict *) args[1]; - - size_t numkeys = dictSize(ht1); - dictRelease(ht1); - dictRelease(ht2); + kvstore *da1 = args[0]; + kvstore *da2 = args[1]; + ebuckets oldHfe = args[2]; + ebDestroy(&oldHfe, &hashExpireBucketsType, NULL); + size_t numkeys = kvstoreSize(da1); + kvstoreRelease(da1); + kvstoreRelease(da2); atomicDecr(lazyfree_objects,numkeys); atomicIncr(lazyfreed_objects,numkeys); + +#if defined(USE_JEMALLOC) + /* Only clear the current thread cache. + * Ignore the return call since this will fail if the tcache is disabled. */ + je_mallctl("thread.tcache.flush", NULL, NULL, NULL, 0); + + jemalloc_purge(); +#endif } /* Release the key tracking table. */ @@ -38,11 +49,22 @@ void lazyFreeTrackingTable(void *args[]) { atomicIncr(lazyfreed_objects,len); } +/* Release the error stats rax tree. */ +void lazyFreeErrors(void *args[]) { + rax *errors = args[0]; + size_t len = errors->numele; + raxFreeWithCallback(errors, zfree); + atomicDecr(lazyfree_objects,len); + atomicIncr(lazyfreed_objects,len); +} + /* Release the lua_scripts dict. */ void lazyFreeLuaScripts(void *args[]) { dict *lua_scripts = args[0]; + list *lua_scripts_lru_list = args[1]; + lua_State *lua = args[2]; long long len = dictSize(lua_scripts); - dictRelease(lua_scripts); + freeLuaScriptsSync(lua_scripts, lua_scripts_lru_list, lua); atomicDecr(lazyfree_objects,len); atomicIncr(lazyfreed_objects,len); } @@ -50,7 +72,7 @@ void lazyFreeLuaScripts(void *args[]) { /* Release the functions ctx. 
*/ void lazyFreeFunctionsCtx(void *args[]) { functionsLibCtx *functions_lib_ctx = args[0]; - size_t len = functionsLibCtxfunctionsLen(functions_lib_ctx); + size_t len = functionsLibCtxFunctionsLen(functions_lib_ctx); functionsLibCtxFree(functions_lib_ctx); atomicDecr(lazyfree_objects,len); atomicIncr(lazyfreed_objects,len); @@ -174,11 +196,19 @@ void freeObjAsync(robj *key, robj *obj, int dbid) { * create a new empty set of hash tables and scheduling the old ones for * lazy freeing. */ void emptyDbAsync(redisDb *db) { - dict *oldht1 = db->dict, *oldht2 = db->expires; - db->dict = dictCreate(&dbDictType); - db->expires = dictCreate(&dbExpiresDictType); - atomicIncr(lazyfree_objects,dictSize(oldht1)); - bioCreateLazyFreeJob(lazyfreeFreeDatabase,2,oldht1,oldht2); + int slot_count_bits = 0; + int flags = KVSTORE_ALLOCATE_DICTS_ON_DEMAND; + if (server.cluster_enabled) { + slot_count_bits = CLUSTER_SLOT_MASK_BITS; + flags |= KVSTORE_FREE_EMPTY_DICTS; + } + kvstore *oldkeys = db->keys, *oldexpires = db->expires; + ebuckets oldHfe = db->hexpires; + db->keys = kvstoreCreate(&dbDictType, slot_count_bits, flags); + db->expires = kvstoreCreate(&dbExpiresDictType, slot_count_bits, flags); + db->hexpires = ebCreate(); + atomicIncr(lazyfree_objects, kvstoreSize(oldkeys)); + bioCreateLazyFreeJob(lazyfreeFreeDatabase, 3, oldkeys, oldexpires, oldHfe); } /* Free the key tracking table. @@ -193,20 +223,33 @@ void freeTrackingRadixTreeAsync(rax *tracking) { } } -/* Free lua_scripts dict, if the dict is huge enough, free it in async way. */ -void freeLuaScriptsAsync(dict *lua_scripts) { +/* Free the error stats rax tree. + * If the rax tree is huge enough, free it in async way. */ +void freeErrorsRadixTreeAsync(rax *errors) { + /* Because this rax has only keys and no values so we use numnodes. 
*/ + if (errors->numnodes > LAZYFREE_THRESHOLD) { + atomicIncr(lazyfree_objects,errors->numele); + bioCreateLazyFreeJob(lazyFreeErrors,1,errors); + } else { + raxFreeWithCallback(errors, zfree); + } +} + +/* Free lua_scripts dict and lru list, if the dict is huge enough, free them in async way. + * Close lua interpreter, if there are a lot of lua scripts, close it in async way. */ +void freeLuaScriptsAsync(dict *lua_scripts, list *lua_scripts_lru_list, lua_State *lua) { if (dictSize(lua_scripts) > LAZYFREE_THRESHOLD) { atomicIncr(lazyfree_objects,dictSize(lua_scripts)); - bioCreateLazyFreeJob(lazyFreeLuaScripts,1,lua_scripts); + bioCreateLazyFreeJob(lazyFreeLuaScripts,3,lua_scripts,lua_scripts_lru_list,lua); } else { - dictRelease(lua_scripts); + freeLuaScriptsSync(lua_scripts, lua_scripts_lru_list, lua); } } /* Free functions ctx, if the functions ctx contains enough functions, free it in async way. */ void freeFunctionsAsync(functionsLibCtx *functions_lib_ctx) { - if (functionsLibCtxfunctionsLen(functions_lib_ctx) > LAZYFREE_THRESHOLD) { - atomicIncr(lazyfree_objects,functionsLibCtxfunctionsLen(functions_lib_ctx)); + if (functionsLibCtxFunctionsLen(functions_lib_ctx) > LAZYFREE_THRESHOLD) { + atomicIncr(lazyfree_objects,functionsLibCtxFunctionsLen(functions_lib_ctx)); bioCreateLazyFreeJob(lazyFreeFunctionsCtx,1,functions_lib_ctx); } else { functionsLibCtxFree(functions_lib_ctx); diff --git a/src/listpack.c b/src/listpack.c index ecc7e9f6fb8..5d9028e13d0 100644 --- a/src/listpack.c +++ b/src/listpack.c @@ -4,33 +4,11 @@ * * https://github.com/antirez/listpack * - * Copyright (c) 2017, Salvatore Sanfilippo - * Copyright (c) 2020, Redis Labs, Inc + * Copyright (c) 2017-Present, Redis Ltd. * All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include @@ -169,7 +147,7 @@ int lpSafeToAdd(unsigned char* lp, size_t add) { * "utils.c", function string2ll(), and is copyright: * * Copyright(C) 2011, Pieter Noordhuis - * Copyright(C) 2011, Salvatore Sanfilippo + * Copyright(C) 2011-current, Redis Ltd. * * The function is released under the BSD 3-clause license. 
*/ @@ -267,51 +245,61 @@ unsigned char* lpShrinkToFit(unsigned char *lp) { static inline void lpEncodeIntegerGetType(int64_t v, unsigned char *intenc, uint64_t *enclen) { if (v >= 0 && v <= 127) { /* Single byte 0-127 integer. */ - intenc[0] = v; - *enclen = 1; + if (intenc != NULL) intenc[0] = v; + if (enclen != NULL) *enclen = 1; } else if (v >= -4096 && v <= 4095) { /* 13 bit integer. */ if (v < 0) v = ((int64_t)1<<13)+v; - intenc[0] = (v>>8)|LP_ENCODING_13BIT_INT; - intenc[1] = v&0xff; - *enclen = 2; + if (intenc != NULL) { + intenc[0] = (v>>8)|LP_ENCODING_13BIT_INT; + intenc[1] = v&0xff; + } + if (enclen != NULL) *enclen = 2; } else if (v >= -32768 && v <= 32767) { /* 16 bit integer. */ if (v < 0) v = ((int64_t)1<<16)+v; - intenc[0] = LP_ENCODING_16BIT_INT; - intenc[1] = v&0xff; - intenc[2] = v>>8; - *enclen = 3; + if (intenc != NULL) { + intenc[0] = LP_ENCODING_16BIT_INT; + intenc[1] = v&0xff; + intenc[2] = v>>8; + } + if (enclen != NULL) *enclen = 3; } else if (v >= -8388608 && v <= 8388607) { /* 24 bit integer. */ if (v < 0) v = ((int64_t)1<<24)+v; - intenc[0] = LP_ENCODING_24BIT_INT; - intenc[1] = v&0xff; - intenc[2] = (v>>8)&0xff; - intenc[3] = v>>16; - *enclen = 4; + if (intenc != NULL) { + intenc[0] = LP_ENCODING_24BIT_INT; + intenc[1] = v&0xff; + intenc[2] = (v>>8)&0xff; + intenc[3] = v>>16; + } + if (enclen != NULL) *enclen = 4; } else if (v >= -2147483648 && v <= 2147483647) { /* 32 bit integer. */ if (v < 0) v = ((int64_t)1<<32)+v; - intenc[0] = LP_ENCODING_32BIT_INT; - intenc[1] = v&0xff; - intenc[2] = (v>>8)&0xff; - intenc[3] = (v>>16)&0xff; - intenc[4] = v>>24; - *enclen = 5; + if (intenc != NULL) { + intenc[0] = LP_ENCODING_32BIT_INT; + intenc[1] = v&0xff; + intenc[2] = (v>>8)&0xff; + intenc[3] = (v>>16)&0xff; + intenc[4] = v>>24; + } + if (enclen != NULL) *enclen = 5; } else { /* 64 bit integer. 
*/ uint64_t uv = v; - intenc[0] = LP_ENCODING_64BIT_INT; - intenc[1] = uv&0xff; - intenc[2] = (uv>>8)&0xff; - intenc[3] = (uv>>16)&0xff; - intenc[4] = (uv>>24)&0xff; - intenc[5] = (uv>>32)&0xff; - intenc[6] = (uv>>40)&0xff; - intenc[7] = (uv>>48)&0xff; - intenc[8] = uv>>56; - *enclen = 9; + if (intenc != NULL) { + intenc[0] = LP_ENCODING_64BIT_INT; + intenc[1] = uv&0xff; + intenc[2] = (uv>>8)&0xff; + intenc[3] = (uv>>16)&0xff; + intenc[4] = (uv>>24)&0xff; + intenc[5] = (uv>>32)&0xff; + intenc[6] = (uv>>40)&0xff; + intenc[7] = (uv>>48)&0xff; + intenc[8] = uv>>56; + } + if (enclen != NULL) *enclen = 9; } } @@ -681,50 +669,47 @@ unsigned char *lpGetValue(unsigned char *p, unsigned int *slen, long long *lval) return vstr; } -/* Find pointer to the entry equal to the specified entry. Skip 'skip' entries - * between every comparison. Returns NULL when the field could not be found. */ -unsigned char *lpFind(unsigned char *lp, unsigned char *p, unsigned char *s, - uint32_t slen, unsigned int skip) { +/* This is just a wrapper to lpGet() that is able to get an integer from an entry directly. + * Returns 1 and stores the integer in 'lval' if the entry is an integer. + * Returns 0 if the entry is a string. */ +int lpGetIntegerValue(unsigned char *p, long long *lval) { + int64_t ele_len; + if (!lpGet(p, &ele_len, NULL)) { + *lval = ele_len; + return 1; + } + return 0; +} + +/* Find pointer to the entry with a comparator callback. + * + * 'cmp' is a comparator callback. If it returns zero, current entry pointer + * will be returned. 'user' is passed to this callback. + * Skip 'skip' entries between every comparison. + * Returns NULL when the field could not be found. */ +unsigned char *lpFindCb(unsigned char *lp, unsigned char *p, + void *user, lpCmp cmp, unsigned int skip) +{ int skipcnt = 0; - unsigned char vencoding = 0; unsigned char *value; - int64_t ll, vll; + int64_t ll; uint64_t entry_size = 123456789; /* initialized to avoid warning. 
*/ uint32_t lp_bytes = lpBytes(lp); - assert(p); + if (!p) + p = lpFirst(lp); + while (p) { if (skipcnt == 0) { value = lpGetWithSize(p, &ll, NULL, &entry_size); if (value) { /* check the value doesn't reach outside the listpack before accessing it */ assert(p >= lp + LP_HDR_SIZE && p + entry_size < lp + lp_bytes); - if (slen == ll && memcmp(value, s, slen) == 0) { - return p; - } - } else { - /* Find out if the searched field can be encoded. Note that - * we do it only the first time, once done vencoding is set - * to non-zero and vll is set to the integer value. */ - if (vencoding == 0) { - /* If the entry can be encoded as integer we set it to - * 1, else set it to UCHAR_MAX, so that we don't retry - * again the next time. */ - if (slen >= 32 || slen == 0 || !lpStringToInt64((const char*)s, slen, &vll)) { - vencoding = UCHAR_MAX; - } else { - vencoding = 1; - } - } - - /* Compare current entry with specified entry, do it only - * if vencoding != UCHAR_MAX because if there is no encoding - * possible for the field it can't be a valid integer. */ - if (vencoding != UCHAR_MAX && ll == vll) { - return p; - } } + if (cmp(lp, p, user, value, ll) == 0) + return p; + /* Reset skip count */ skipcnt = skip; p += entry_size; @@ -749,6 +734,62 @@ unsigned char *lpFind(unsigned char *lp, unsigned char *p, unsigned char *s, return NULL; } +struct lpFindArg { + unsigned char *s; /* Item to search */ + uint32_t slen; /* Item len */ + int vencoding; + int64_t vll; +}; + +/* Comparator function to find item */ +static inline int lpFindCmp(const unsigned char *lp, unsigned char *p, + void *user, unsigned char *s, long long slen) { + (void) lp; + (void) p; + struct lpFindArg *arg = user; + + if (s) { + if (slen == arg->slen && memcmp(arg->s, s, slen) == 0) { + return 0; + } + } else { + /* Find out if the searched field can be encoded. Note that + * we do it only the first time, once done vencoding is set + * to non-zero and vll is set to the integer value. 
*/ + if (arg->vencoding == 0) { + /* If the entry can be encoded as integer we set it to + * 1, else set it to UCHAR_MAX, so that we don't retry + * again the next time. */ + if (arg->slen >= 32 || arg->slen == 0 || !lpStringToInt64((const char*)arg->s, arg->slen, &arg->vll)) { + arg->vencoding = UCHAR_MAX; + } else { + arg->vencoding = 1; + } + } + + /* Compare current entry with specified entry, do it only + * if vencoding != UCHAR_MAX because if there is no encoding + * possible for the field it can't be a valid integer. */ + if (arg->vencoding != UCHAR_MAX && slen == arg->vll) { + return 0; + } + } + + return 1; +} + +/* Find pointer to the entry equal to the specified entry. Skip 'skip' entries + * between every comparison. Returns NULL when the field could not be found. */ +unsigned char *lpFind(unsigned char *lp, unsigned char *p, unsigned char *s, + uint32_t slen, unsigned int skip) +{ + struct lpFindArg arg = { + .s = s, + .slen = slen + }; + return lpFindCb(lp, p, &arg, lpFindCmp, skip); +} + /* Insert, delete or replace the specified string element 'elestr' of length * 'size' or integer element 'eleint' at the specified position 'p', with 'p' * being a listpack element pointer obtained with lpFirst(), lpLast(), lpNext(), @@ -926,6 +967,140 @@ unsigned char *lpInsert(unsigned char *lp, unsigned char *elestr, unsigned char return lp; } +/* Insert the specified elements with 'entries' and 'len' at the specified + * position 'p', with 'p' being a listpack element pointer obtained with + * lpFirst(), lpLast(), lpNext(), lpPrev() or lpSeek(). + * + * This is similar to lpInsert() but allows you to insert batch of entries in + * one call. This function is more efficient than inserting entries one by one + * as it does single realloc()/memmove() calls for all the entries. + * + * In each listpackEntry, if 'sval' is not null, it is assumed entry is string + * and 'sval' and 'slen' will be used. Otherwise, 'lval' will be used to append + * the integer entry. 
+ * + * The elements are inserted before or after the element pointed by 'p' + * depending on the 'where' argument, that can be LP_BEFORE or LP_AFTER. + * + * If 'newp' is not NULL, at the end of a successful call '*newp' will be set + * to the address of the element just added, so that it will be possible to + * continue an interaction with lpNext() and lpPrev(). + * + * Returns NULL on out of memory or when the listpack total length would exceed + * the max allowed size of 2^32-1, otherwise the new pointer to the listpack + * holding the new element is returned (and the old pointer passed is no longer + * considered valid). */ +unsigned char *lpBatchInsert(unsigned char *lp, unsigned char *p, int where, + listpackEntry *entries, unsigned int len, + unsigned char **newp) +{ + assert(where == LP_BEFORE || where == LP_AFTER); + assert(entries != NULL && len > 0); + + struct listpackInsertEntry { + int enctype; + uint64_t enclen; + unsigned char intenc[LP_MAX_INT_ENCODING_LEN]; + unsigned char backlen[LP_MAX_BACKLEN_SIZE]; + unsigned long backlen_size; + }; + + uint64_t addedlen = 0; /* The encoded length of the added elements. */ + struct listpackInsertEntry tmp[3]; /* Encoded entries */ + struct listpackInsertEntry *enc = tmp; + + if (len > sizeof(tmp) / sizeof(struct listpackInsertEntry)) { + /* If 'len' is larger than local buffer size, allocate on heap. */ + enc = zmalloc(len * sizeof(struct listpackInsertEntry)); + } + + /* If we need to insert after the current element, we just jump to the + * next element (that could be the EOF one) and handle the case of + * inserting before. So the function will actually deal with just one + * case: LP_BEFORE. 
*/ + if (where == LP_AFTER) { + p = lpSkip(p); + where = LP_BEFORE; + ASSERT_INTEGRITY(lp, p); + } + + for (unsigned int i = 0; i < len; i++) { + listpackEntry *e = &entries[i]; + if (e->sval) { + /* Calling lpEncodeGetType() results into the encoded version of the + * element to be stored into 'intenc' in case it is representable as + * an integer: in that case, the function returns LP_ENCODING_INT. + * Otherwise, if LP_ENCODING_STR is returned, we'll have to call + * lpEncodeString() to actually write the encoded string on place + * later. + * + * Whatever the returned encoding is, 'enclen' is populated with the + * length of the encoded element. */ + enc[i].enctype = lpEncodeGetType(e->sval, e->slen, + enc[i].intenc, &enc[i].enclen); + } else { + enc[i].enctype = LP_ENCODING_INT; + lpEncodeIntegerGetType(e->lval, enc[i].intenc, &enc[i].enclen); + } + addedlen += enc[i].enclen; + + /* We need to also encode the backward-parsable length of the element + * and append it to the end: this allows to traverse the listpack from + * the end to the start. */ + enc[i].backlen_size = lpEncodeBacklen(enc[i].backlen, enc[i].enclen); + addedlen += enc[i].backlen_size; + } + + uint64_t old_listpack_bytes = lpGetTotalBytes(lp); + uint64_t new_listpack_bytes = old_listpack_bytes + addedlen; + if (new_listpack_bytes > UINT32_MAX) return NULL; + + /* Store the offset of the element 'p', so that we can obtain its + * address again after a reallocation. */ + unsigned long poff = p-lp; + unsigned char *dst = lp + poff; /* May be updated after reallocation. */ + + /* Realloc before: we need more room. */ + if (new_listpack_bytes > old_listpack_bytes && + new_listpack_bytes > lp_malloc_size(lp)) { + if ((lp = lp_realloc(lp,new_listpack_bytes)) == NULL) return NULL; + dst = lp + poff; + } + + /* Setup the listpack relocating the elements to make the exact room + * we need to store the new ones. 
*/ + memmove(dst+addedlen,dst,old_listpack_bytes-poff); + + for (unsigned int i = 0; i < len; i++) { + listpackEntry *ent = &entries[i]; + + if (newp) + *newp = dst; + + if (enc[i].enctype == LP_ENCODING_INT) + memcpy(dst, enc[i].intenc, enc[i].enclen); + else + lpEncodeString(dst, ent->sval, ent->slen); + + dst += enc[i].enclen; + memcpy(dst, enc[i].backlen, enc[i].backlen_size); + dst += enc[i].backlen_size; + } + + /* Update header. */ + uint32_t num_elements = lpGetNumElements(lp); + if (num_elements != LP_HDR_NUMELE_UNKNOWN) { + if ((int64_t) len > (int64_t) LP_HDR_NUMELE_UNKNOWN - (int64_t) num_elements) + lpSetNumElements(lp, LP_HDR_NUMELE_UNKNOWN); + else + lpSetNumElements(lp,num_elements + len); + } + lpSetTotalBytes(lp,new_listpack_bytes); + if (enc != tmp) lp_free(enc); + + return lp; +} + /* This is just a wrapper for lpInsert() to directly use a string. */ unsigned char *lpInsertString(unsigned char *lp, unsigned char *s, uint32_t slen, unsigned char *p, int where, unsigned char **newp) @@ -973,6 +1148,20 @@ unsigned char *lpAppendInteger(unsigned char *lp, long long lval) { return lpInsertInteger(lp, lval, eofptr, LP_BEFORE, NULL); } +/* Append batch of entries to the listpack. + * + * This call is more efficient than multiple lpAppend() calls as it only does + * a single realloc() for all the given entries. + * + * In each listpackEntry, if 'sval' is not null, it is assumed entry is string + * and 'sval' and 'slen' will be used. Otherwise, 'lval' will be used to append + * the integer entry. */ +unsigned char *lpBatchAppend(unsigned char *lp, listpackEntry *entries, unsigned long len) { + uint64_t listpack_bytes = lpGetTotalBytes(lp); + unsigned char *eofptr = lp + listpack_bytes - 1; + return lpBatchInsert(lp, eofptr, LP_BEFORE, entries, len, NULL); +} + /* This is just a wrapper for lpInsert() to directly use a string to replace * the current element. 
The function returns the new listpack as return * value, and also updates the current cursor by updating '*p'. */ @@ -1221,13 +1410,17 @@ size_t lpBytes(unsigned char *lp) { return lpGetTotalBytes(lp); } -/* Returns the size of a listpack consisting of an integer repeated 'rep' times. */ -size_t lpEstimateBytesRepeatedInteger(long long lval, unsigned long rep) { +/* Returns the size 'lval' will require when encoded, in bytes */ +size_t lpEntrySizeInteger(long long lval) { uint64_t enclen; - unsigned char intenc[LP_MAX_INT_ENCODING_LEN]; - lpEncodeIntegerGetType(lval, intenc, &enclen); + lpEncodeIntegerGetType(lval, NULL, &enclen); unsigned long backlen = lpEncodeBacklen(NULL, enclen); - return LP_HDR_SIZE + (enclen + backlen) * rep + 1; + return enclen + backlen; +} + +/* Returns the size of a listpack consisting of an integer repeated 'rep' times. */ +size_t lpEstimateBytesRepeatedInteger(long long lval, unsigned long rep) { + return LP_HDR_SIZE + lpEntrySizeInteger(lval) * rep + 1; } /* Seek the specified element and returns the pointer to the seeked element. @@ -1430,15 +1623,20 @@ static inline void lpSaveValue(unsigned char *val, unsigned int len, int64_t lva /* Randomly select a pair of key and value. * total_count is a pre-computed length/2 of the listpack (to avoid calls to lpLength) * 'key' and 'val' are used to store the result key value pair. - * 'val' can be NULL if the value is not needed. */ -void lpRandomPair(unsigned char *lp, unsigned long total_count, listpackEntry *key, listpackEntry *val) { + * 'val' can be NULL if the value is not needed. + * 'tuple_len' indicates entry count of a single logical item. It should be 2 + * if listpack was saved as key-value pair or more for key-value-...(n_entries). 
*/ +void lpRandomPair(unsigned char *lp, unsigned long total_count, + listpackEntry *key, listpackEntry *val, int tuple_len) +{ unsigned char *p; + assert(tuple_len >= 2); + /* Avoid div by zero on corrupt listpack */ assert(total_count); - /* Generate even numbers, because listpack saved K-V pair */ - int r = (rand() % total_count) * 2; + int r = (rand() % total_count) * tuple_len; assert((p = lpSeek(lp, r))); key->sval = lpGetValue(p, &(key->slen), &(key->lval)); @@ -1488,26 +1686,31 @@ void lpRandomEntries(unsigned char *lp, unsigned int count, listpackEntry *entri /* Randomly select count of key value pairs and store into 'keys' and * 'vals' args. The order of the picked entries is random, and the selections * are non-unique (repetitions are possible). - * The 'vals' arg can be NULL in which case we skip these. */ -void lpRandomPairs(unsigned char *lp, unsigned int count, listpackEntry *keys, listpackEntry *vals) { + * The 'vals' arg can be NULL in which case we skip these. + * 'tuple_len' indicates entry count of a single logical item. It should be 2 + * if listpack was saved as key-value pair or more for key-value-...(n_entries). */ +void lpRandomPairs(unsigned char *lp, unsigned int count, listpackEntry *keys, listpackEntry *vals, int tuple_len) { unsigned char *p, *key, *value; unsigned int klen = 0, vlen = 0; long long klval = 0, vlval = 0; + assert(tuple_len >= 2); + /* Notice: the index member must be first due to the use in uintCompare */ typedef struct { unsigned int index; unsigned int order; } rand_pick; rand_pick *picks = lp_malloc(sizeof(rand_pick)*count); - unsigned int total_size = lpLength(lp)/2; + unsigned int total_size = lpLength(lp)/tuple_len; /* Avoid div by zero on corrupt listpack */ assert(total_size); /* create a pool of random indexes (some may be duplicate). 
*/ for (unsigned int i = 0; i < count; i++) { - picks[i].index = (rand() % total_size) * 2; /* Generate even indexes */ + /* Generate indexes that key exist at */ + picks[i].index = (rand() % total_size) * tuple_len; /* keep track of the order we picked them */ picks[i].order = i; } @@ -1529,8 +1732,11 @@ void lpRandomPairs(unsigned char *lp, unsigned int count, listpackEntry *keys, l lpSaveValue(value, vlen, vlval, &vals[storeorder]); pickindex++; } - lpindex += 2; - p = lpNext(lp, p); + lpindex += tuple_len; + + for (int i = 0; i < tuple_len - 1; i++) { + p = lpNext(lp, p); + } } lp_free(picks); @@ -1540,13 +1746,20 @@ void lpRandomPairs(unsigned char *lp, unsigned int count, listpackEntry *keys, l * 'vals' args. The selections are unique (no repetitions), and the order of * the picked entries is NOT-random. * The 'vals' arg can be NULL in which case we skip these. + * 'tuple_len' indicates entry count of a single logical item. It should be 2 + * if listpack was saved as key-value pair or more for key-value-...(n_entries). * The return value is the number of items picked which can be lower than the * requested count if the listpack doesn't hold enough pairs. 
*/ -unsigned int lpRandomPairsUnique(unsigned char *lp, unsigned int count, listpackEntry *keys, listpackEntry *vals) { +unsigned int lpRandomPairsUnique(unsigned char *lp, unsigned int count, + listpackEntry *keys, listpackEntry *vals, + int tuple_len) +{ + assert(tuple_len >= 2); + unsigned char *p, *key; unsigned int klen = 0; long long klval = 0; - unsigned int total_size = lpLength(lp)/2; + unsigned int total_size = lpLength(lp)/tuple_len; unsigned int index = 0; if (count > total_size) count = total_size; @@ -1554,7 +1767,7 @@ unsigned int lpRandomPairsUnique(unsigned char *lp, unsigned int count, listpack p = lpFirst(lp); unsigned int picked = 0, remaining = count; while (picked < count && p) { - assert((p = lpNextRandom(lp, p, &index, remaining, 1))); + assert((p = lpNextRandom(lp, p, &index, remaining, tuple_len))); key = lpGetValue(p, &klen, &klval); lpSaveValue(key, klen, klval, &keys[picked]); assert((p = lpNext(lp, p))); @@ -1576,8 +1789,9 @@ unsigned int lpRandomPairsUnique(unsigned char *lp, unsigned int count, listpack * the end of the list. The 'index' needs to be initialized according to the * current zero-based index matching the position of the starting element 'p' * and is updated to match the returned element's zero-based index. If - * 'even_only' is nonzero, an element with an even index is picked, which is - * useful if the listpack represents a key-value pair sequence. + * 'tuple_len' indicates entry count of a single logical item. e.g. This is + * useful if listpack represents key-value pairs. In this case, tuple_len should + * be two and even indexes will be picked. * * Note that this function can return p. 
In order to skip the previously * returned element, you need to call lpNext() or lpDelete() after each call to @@ -1587,7 +1801,7 @@ unsigned int lpRandomPairsUnique(unsigned char *lp, unsigned int count, listpack * p = lpFirst(lp); * i = 0; * while (remaining > 0) { - * p = lpNextRandom(lp, p, &i, remaining--, 0); + * p = lpNextRandom(lp, p, &i, remaining--, 1); * * // ... Do stuff with p ... * @@ -1596,8 +1810,9 @@ unsigned int lpRandomPairsUnique(unsigned char *lp, unsigned int count, listpack * } */ unsigned char *lpNextRandom(unsigned char *lp, unsigned char *p, unsigned int *index, - unsigned int remaining, int even_only) + unsigned int remaining, int tuple_len) { + assert(tuple_len > 0); /* To only iterate once, every time we try to pick a member, the probability * we pick it is the quotient of the count left we want to pick and the * count still we haven't visited. This way, we could make every member be @@ -1605,15 +1820,14 @@ unsigned char *lpNextRandom(unsigned char *lp, unsigned char *p, unsigned int *i unsigned int i = *index; unsigned int total_size = lpLength(lp); while (i < total_size && p != NULL) { - if (even_only && i % 2 != 0) { + if (i % tuple_len != 0) { p = lpNext(lp, p); i++; continue; } /* Do we pick this element? */ - unsigned int available = total_size - i; - if (even_only) available /= 2; + unsigned int available = (total_size - i) / tuple_len; double randomDouble = ((double)rand()) / RAND_MAX; double threshold = ((double)remaining) / available; if (randomDouble <= threshold) { @@ -1809,6 +2023,24 @@ static int lpValidation(unsigned char *p, unsigned int head_count, void *userdat return ret; } +static int lpFindCbCmp(const unsigned char *lp, unsigned char *p, void *user, unsigned char *s, long long slen) { + assert(lp); + assert(p); + + char *n = user; + + if (!s) { + int64_t sval; + if (lpStringToInt64((const char*)n, strlen(n), &sval)) + return slen == sval ? 
0 : 1; + } else { + if (strlen(n) == (size_t) slen && memcmp(n, s, slen) == 0) + return 0; + } + + return 1; +} + int listpackTest(int argc, char *argv[], int flags) { UNUSED(argc); UNUSED(argv); @@ -2053,6 +2285,111 @@ int listpackTest(int argc, char *argv[], int flags) { zfree(lp); } + TEST("Batch append") { + listpackEntry ent[6] = { + {.sval = (unsigned char*)mixlist[0], .slen = strlen(mixlist[0])}, + {.sval = (unsigned char*)mixlist[1], .slen = strlen(mixlist[1])}, + {.sval = (unsigned char*)mixlist[2], .slen = strlen(mixlist[2])}, + {.lval = 4294967296}, + {.sval = (unsigned char*)mixlist[3], .slen = strlen(mixlist[3])}, + {.lval = -100} + }; + + lp = lpNew(0); + lp = lpBatchAppend(lp, ent, 2); + verifyEntry(lpSeek(lp, 0), ent[0].sval, ent[0].slen); + verifyEntry(lpSeek(lp, 1), ent[1].sval, ent[1].slen); + assert(lpLength(lp) == 2); + + lp = lpBatchAppend(lp, &ent[2], 1); + verifyEntry(lpSeek(lp, 0), ent[0].sval, ent[0].slen); + verifyEntry(lpSeek(lp, 1), ent[1].sval, ent[1].slen); + verifyEntry(lpSeek(lp, 2), ent[2].sval, ent[2].slen); + assert(lpLength(lp) == 3); + + lp = lpDeleteRange(lp, 1, 1); + verifyEntry(lpSeek(lp, 0), ent[0].sval, ent[0].slen); + verifyEntry(lpSeek(lp, 1), ent[2].sval, ent[2].slen); + assert(lpLength(lp) == 2); + + lp = lpBatchAppend(lp, &ent[3], 3); + verifyEntry(lpSeek(lp, 0), ent[0].sval, ent[0].slen); + verifyEntry(lpSeek(lp, 1), ent[2].sval, ent[2].slen); + verifyEntry(lpSeek(lp, 2), (unsigned char*) "4294967296", 10); + verifyEntry(lpSeek(lp, 3), ent[4].sval, ent[4].slen); + verifyEntry(lpSeek(lp, 4), (unsigned char*) "-100", 4); + assert(lpLength(lp) == 5); + + lp = lpDeleteRange(lp, 1, 3); + verifyEntry(lpSeek(lp, 0), ent[0].sval, ent[0].slen); + verifyEntry(lpSeek(lp, 1), (unsigned char*) "-100", 4); + assert(lpLength(lp) == 2); + + lpFree(lp); + } + + TEST("Batch insert") { + lp = lpNew(0); + listpackEntry ent[6] = { + {.sval = (unsigned char*)mixlist[0], .slen = strlen(mixlist[0])}, + {.sval = (unsigned char*)mixlist[1], 
.slen = strlen(mixlist[1])}, + {.sval = (unsigned char*)mixlist[2], .slen = strlen(mixlist[2])}, + {.lval = 4294967296}, + {.sval = (unsigned char*)mixlist[3], .slen = strlen(mixlist[3])}, + {.lval = -100} + }; + + lp = lpBatchAppend(lp, ent, 4); + assert(lpLength(lp) == 4); + verifyEntry(lpSeek(lp, 0), ent[0].sval, ent[0].slen); + verifyEntry(lpSeek(lp, 1), ent[1].sval, ent[1].slen); + verifyEntry(lpSeek(lp, 2), ent[2].sval, ent[2].slen); + verifyEntry(lpSeek(lp, 3), (unsigned char*)"4294967296", 10); + + /* Insert with LP_BEFORE */ + p = lpSeek(lp, 3); + lp = lpBatchInsert(lp, p, LP_BEFORE, &ent[4], 2, &p); + verifyEntry(p, (unsigned char*)"-100", 4); + assert(lpLength(lp) == 6); + verifyEntry(lpSeek(lp, 0), ent[0].sval, ent[0].slen); + verifyEntry(lpSeek(lp, 1), ent[1].sval, ent[1].slen); + verifyEntry(lpSeek(lp, 2), ent[2].sval, ent[2].slen); + verifyEntry(lpSeek(lp, 3), ent[4].sval, ent[4].slen); + verifyEntry(lpSeek(lp, 4), (unsigned char*)"-100", 4); + verifyEntry(lpSeek(lp, 5), (unsigned char*)"4294967296", 10); + + lp = lpDeleteRange(lp, 1, 2); + assert(lpLength(lp) == 4); + verifyEntry(lpSeek(lp, 0), ent[0].sval, ent[0].slen); + verifyEntry(lpSeek(lp, 1), ent[4].sval, ent[4].slen); + verifyEntry(lpSeek(lp, 2), (unsigned char*)"-100", 4); + verifyEntry(lpSeek(lp, 3), (unsigned char*)"4294967296", 10); + + /* Insert with LP_AFTER */ + p = lpSeek(lp, 0); + lp = lpBatchInsert(lp, p, LP_AFTER, &ent[1], 2, &p); + verifyEntry(p, ent[2].sval, ent[2].slen); + assert(lpLength(lp) == 6); + verifyEntry(lpSeek(lp, 0), ent[0].sval, ent[0].slen); + verifyEntry(lpSeek(lp, 1), ent[1].sval, ent[1].slen); + verifyEntry(lpSeek(lp, 2), ent[2].sval, ent[2].slen); + verifyEntry(lpSeek(lp, 3), ent[4].sval, ent[4].slen); + verifyEntry(lpSeek(lp, 4), (unsigned char*)"-100", 4); + verifyEntry(lpSeek(lp, 5), (unsigned char*)"4294967296", 10); + + lp = lpDeleteRange(lp, 2, 4); + assert(lpLength(lp) == 2); + p = lpSeek(lp, 1); + lp = lpBatchInsert(lp, p, LP_AFTER, &ent[2], 1, &p); + 
verifyEntry(p, ent[2].sval, ent[2].slen); + assert(lpLength(lp) == 3); + verifyEntry(lpSeek(lp, 0), ent[0].sval, ent[0].slen); + verifyEntry(lpSeek(lp, 1), ent[1].sval, ent[1].slen); + verifyEntry(lpSeek(lp, 2), ent[2].sval, ent[2].slen); + + lpFree(lp); + } + TEST("Batch delete") { unsigned char *lp = createList(); /* char *mixlist[] = {"hello", "foo", "quux", "1024"} */ assert(lpLength(lp) == 4); /* Pre-condition */ @@ -2232,7 +2569,7 @@ int listpackTest(int argc, char *argv[], int flags) { unsigned index = 0; while (remaining > 0) { assert(p != NULL); - p = lpNextRandom(lp, p, &index, remaining--, 0); + p = lpNextRandom(lp, p, &index, remaining--, 1); assert(p != NULL); assert(p != prev); prev = p; @@ -2248,7 +2585,7 @@ int listpackTest(int argc, char *argv[], int flags) { unsigned i = 0; /* Pick from empty listpack returns NULL. */ - assert(lpNextRandom(lp, NULL, &i, 2, 0) == NULL); + assert(lpNextRandom(lp, NULL, &i, 2, 1) == NULL); /* Add some elements and find their pointers within the listpack. */ lp = lpAppend(lp, (unsigned char *)"abc", 3); @@ -2261,19 +2598,19 @@ int listpackTest(int argc, char *argv[], int flags) { assert(lpNext(lp, p2) == NULL); /* Pick zero elements returns NULL. */ - i = 0; assert(lpNextRandom(lp, lpFirst(lp), &i, 0, 0) == NULL); + i = 0; assert(lpNextRandom(lp, lpFirst(lp), &i, 0, 1) == NULL); /* Pick all returns all. */ - i = 0; assert(lpNextRandom(lp, p0, &i, 3, 0) == p0 && i == 0); - i = 1; assert(lpNextRandom(lp, p1, &i, 2, 0) == p1 && i == 1); - i = 2; assert(lpNextRandom(lp, p2, &i, 1, 0) == p2 && i == 2); + i = 0; assert(lpNextRandom(lp, p0, &i, 3, 1) == p0 && i == 0); + i = 1; assert(lpNextRandom(lp, p1, &i, 2, 1) == p1 && i == 1); + i = 2; assert(lpNextRandom(lp, p2, &i, 1, 1) == p2 && i == 2); /* Pick more than one when there's only one left returns the last one. 
*/ - i = 2; assert(lpNextRandom(lp, p2, &i, 42, 0) == p2 && i == 2); + i = 2; assert(lpNextRandom(lp, p2, &i, 42, 1) == p2 && i == 2); /* Pick all even elements returns p0 and p2. */ - i = 0; assert(lpNextRandom(lp, p0, &i, 10, 1) == p0 && i == 0); - i = 1; assert(lpNextRandom(lp, p1, &i, 10, 1) == p2 && i == 2); + i = 0; assert(lpNextRandom(lp, p0, &i, 10, 2) == p0 && i == 0); + i = 1; assert(lpNextRandom(lp, p1, &i, 10, 2) == p2 && i == 2); /* Don't crash even for bad index. */ for (int j = 0; j < 100; j++) { @@ -2286,7 +2623,7 @@ int listpackTest(int argc, char *argv[], int flags) { } i = j % 7; unsigned int remaining = j % 5; - p = lpNextRandom(lp, p, &i, remaining, 0); + p = lpNextRandom(lp, p, &i, remaining, 1); assert(p == p0 || p == p1 || p == p2 || p == NULL); } lpFree(lp); @@ -2297,7 +2634,7 @@ int listpackTest(int argc, char *argv[], int flags) { unsigned char *lp = lpNew(0); lp = lpAppend(lp, (unsigned char*)"abc", 3); lp = lpAppend(lp, (unsigned char*)"123", 3); - lpRandomPair(lp, 1, &key, &val); + lpRandomPair(lp, 1, &key, &val, 2); assert(memcmp(key.sval, "abc", key.slen) == 0); assert(val.lval == 123); lpFree(lp); @@ -2310,7 +2647,7 @@ int listpackTest(int argc, char *argv[], int flags) { lp = lpAppend(lp, (unsigned char*)"123", 3); lp = lpAppend(lp, (unsigned char*)"456", 3); lp = lpAppend(lp, (unsigned char*)"def", 3); - lpRandomPair(lp, 2, &key, &val); + lpRandomPair(lp, 2, &key, &val, 2); if (key.sval) { assert(!memcmp(key.sval, "abc", key.slen)); assert(key.slen == 3); @@ -2323,6 +2660,42 @@ int listpackTest(int argc, char *argv[], int flags) { lpFree(lp); } + TEST("Random pair with tuple_len 3") { + listpackEntry key, val; + unsigned char *lp = lpNew(0); + lp = lpAppend(lp, (unsigned char*)"abc", 3); + lp = lpAppend(lp, (unsigned char*)"123", 3); + lp = lpAppend(lp, (unsigned char*)"xxx", 3); + lp = lpAppend(lp, (unsigned char*)"456", 3); + lp = lpAppend(lp, (unsigned char*)"def", 3); + lp = lpAppend(lp, (unsigned char*)"xxx", 3); + lp = 
lpAppend(lp, (unsigned char*)"281474976710655", 15); + lp = lpAppend(lp, (unsigned char*)"789", 3); + lp = lpAppend(lp, (unsigned char*)"xxx", 3); + + for (int i = 0; i < 5; i++) { + lpRandomPair(lp, 3, &key, &val, 3); + if (key.sval) { + if (!memcmp(key.sval, "abc", key.slen)) { + assert(key.slen == 3); + assert(val.lval == 123); + } else { + assert(0); + }; + } + if (!key.sval) { + if (key.lval == 456) + assert(!memcmp(val.sval, "def", val.slen)); + else if (key.lval == 281474976710655LL) + assert(val.lval == 789); + else + assert(0); + } + } + + lpFree(lp); + } + TEST("Random pairs with one element") { int count = 5; unsigned char *lp = lpNew(0); @@ -2331,7 +2704,7 @@ int listpackTest(int argc, char *argv[], int flags) { lp = lpAppend(lp, (unsigned char*)"abc", 3); lp = lpAppend(lp, (unsigned char*)"123", 3); - lpRandomPairs(lp, count, keys, vals); + lpRandomPairs(lp, count, keys, vals, 2); assert(memcmp(keys[4].sval, "abc", keys[4].slen) == 0); assert(vals[4].lval == 123); zfree(keys); @@ -2349,7 +2722,7 @@ int listpackTest(int argc, char *argv[], int flags) { lp = lpAppend(lp, (unsigned char*)"123", 3); lp = lpAppend(lp, (unsigned char*)"456", 3); lp = lpAppend(lp, (unsigned char*)"def", 3); - lpRandomPairs(lp, count, keys, vals); + lpRandomPairs(lp, count, keys, vals, 2); for (int i = 0; i < count; i++) { if (keys[i].sval) { assert(!memcmp(keys[i].sval, "abc", keys[i].slen)); @@ -2366,6 +2739,47 @@ int listpackTest(int argc, char *argv[], int flags) { lpFree(lp); } + TEST("Random pairs with many elements and tuple_len 3") { + int count = 5; + lp = lpNew(0); + listpackEntry *keys = zcalloc(sizeof(listpackEntry) * count); + listpackEntry *vals = zcalloc(sizeof(listpackEntry) * count); + + lp = lpAppend(lp, (unsigned char*)"abc", 3); + lp = lpAppend(lp, (unsigned char*)"123", 3); + lp = lpAppend(lp, (unsigned char*)"xxx", 3); + lp = lpAppend(lp, (unsigned char*)"456", 3); + lp = lpAppend(lp, (unsigned char*)"def", 3); + lp = lpAppend(lp, (unsigned char*)"xxx", 
3); + lp = lpAppend(lp, (unsigned char*)"281474976710655", 15); + lp = lpAppend(lp, (unsigned char*)"789", 3); + lp = lpAppend(lp, (unsigned char*)"xxx", 3); + + lpRandomPairs(lp, count, keys, vals, 3); + for (int i = 0; i < count; i++) { + if (keys[i].sval) { + if (!memcmp(keys[i].sval, "abc", keys[i].slen)) { + assert(keys[i].slen == 3); + assert(vals[i].lval == 123); + } else { + assert(0); + }; + } + if (!keys[i].sval) { + if (keys[i].lval == 456) + assert(!memcmp(vals[i].sval, "def", vals[i].slen)); + else if (keys[i].lval == 281474976710655LL) + assert(vals[i].lval == 789); + else + assert(0); + } + } + + zfree(keys); + zfree(vals); + lpFree(lp); + } + TEST("Random pairs unique with one element") { unsigned picked; int count = 5; @@ -2375,7 +2789,7 @@ int listpackTest(int argc, char *argv[], int flags) { lp = lpAppend(lp, (unsigned char*)"abc", 3); lp = lpAppend(lp, (unsigned char*)"123", 3); - picked = lpRandomPairsUnique(lp, count, keys, vals); + picked = lpRandomPairsUnique(lp, count, keys, vals, 2); assert(picked == 1); assert(memcmp(keys[0].sval, "abc", keys[0].slen) == 0); assert(vals[0].lval == 123); @@ -2395,7 +2809,7 @@ int listpackTest(int argc, char *argv[], int flags) { lp = lpAppend(lp, (unsigned char*)"123", 3); lp = lpAppend(lp, (unsigned char*)"456", 3); lp = lpAppend(lp, (unsigned char*)"def", 3); - picked = lpRandomPairsUnique(lp, count, keys, vals); + picked = lpRandomPairsUnique(lp, count, keys, vals, 2); assert(picked == 2); for (int i = 0; i < 2; i++) { if (keys[i].sval) { @@ -2413,6 +2827,47 @@ int listpackTest(int argc, char *argv[], int flags) { lpFree(lp); } + TEST("Random pairs unique with many elements and tuple_len 3") { + unsigned picked; + int count = 5; + lp = lpNew(0); + listpackEntry *keys = zmalloc(sizeof(listpackEntry) * count); + listpackEntry *vals = zmalloc(sizeof(listpackEntry) * count); + + lp = lpAppend(lp, (unsigned char*)"abc", 3); + lp = lpAppend(lp, (unsigned char*)"123", 3); + lp = lpAppend(lp, (unsigned 
char*)"xxx", 3); + lp = lpAppend(lp, (unsigned char*)"456", 3); + lp = lpAppend(lp, (unsigned char*)"def", 3); + lp = lpAppend(lp, (unsigned char*)"xxx", 3); + lp = lpAppend(lp, (unsigned char*)"281474976710655", 15); + lp = lpAppend(lp, (unsigned char*)"789", 3); + lp = lpAppend(lp, (unsigned char*)"xxx", 3); + picked = lpRandomPairsUnique(lp, count, keys, vals, 3); + assert(picked == 3); + for (int i = 0; i < 3; i++) { + if (keys[i].sval) { + if (!memcmp(keys[i].sval, "abc", keys[i].slen)) { + assert(keys[i].slen == 3); + assert(vals[i].lval == 123); + } else { + assert(0); + }; + } + if (!keys[i].sval) { + if (keys[i].lval == 456) + assert(!memcmp(vals[i].sval, "def", vals[i].slen)); + else if (keys[i].lval == 281474976710655LL) + assert(vals[i].lval == 789); + else + assert(0); + } + } + zfree(keys); + zfree(vals); + lpFree(lp); + } + TEST("push various encodings") { lp = lpNew(0); @@ -2471,6 +2926,21 @@ int listpackTest(int argc, char *argv[], int flags) { lpFree(lp); } + TEST("Test lpFindCb") { + lp = createList(); /* "hello", "foo", "quux", "1024" */ + assert(lpFindCb(lp, lpFirst(lp), "abc", lpFindCbCmp, 0) == NULL); + verifyEntry(lpFindCb(lp, NULL, "hello", lpFindCbCmp, 0), (unsigned char*)"hello", 5); + verifyEntry(lpFindCb(lp, NULL, "1024", lpFindCbCmp, 0), (unsigned char*)"1024", 4); + verifyEntry(lpFindCb(lp, NULL, "quux", lpFindCbCmp, 0), (unsigned char*)"quux", 4); + verifyEntry(lpFindCb(lp, NULL, "foo", lpFindCbCmp, 0), (unsigned char*)"foo", 3); + lpFree(lp); + + lp = lpNew(0); + assert(lpFindCb(lp, lpFirst(lp), "hello", lpFindCbCmp, 0) == NULL); + assert(lpFindCb(lp, lpFirst(lp), "1024", lpFindCbCmp, 0) == NULL); + lpFree(lp); + } + TEST("Test lpValidateIntegrity") { lp = createList(); long count = 0; @@ -2493,6 +2963,26 @@ int listpackTest(int argc, char *argv[], int flags) { lpFree(lp); } + TEST("Test number of elements exceeds LP_HDR_NUMELE_UNKNOWN with batch insert") { + listpackEntry ent[2] = { + {.sval = (unsigned char*)mixlist[0], .slen = 
strlen(mixlist[0])}, + {.sval = (unsigned char*)mixlist[1], .slen = strlen(mixlist[1])} + }; + + lp = lpNew(0); + for (int i = 0; i < (LP_HDR_NUMELE_UNKNOWN/2) + 1; i++) + lp = lpBatchAppend(lp, ent, 2); + + assert(lpGetNumElements(lp) == LP_HDR_NUMELE_UNKNOWN); + assert(lpLength(lp) == LP_HDR_NUMELE_UNKNOWN+1); + + lp = lpDeleteRange(lp, -2, 2); + assert(lpGetNumElements(lp) == LP_HDR_NUMELE_UNKNOWN); + assert(lpLength(lp) == LP_HDR_NUMELE_UNKNOWN-1); + assert(lpGetNumElements(lp) == LP_HDR_NUMELE_UNKNOWN-1); /* update length after lpLength */ + lpFree(lp); + } + TEST("Stress with random payloads of different encoding") { unsigned long long start = usec(); int i,j,len,where; diff --git a/src/listpack.h b/src/listpack.h index a60f089f9cd..c9fbc56241b 100644 --- a/src/listpack.h +++ b/src/listpack.h @@ -4,32 +4,11 @@ * * https://github.com/antirez/listpack * - * Copyright (c) 2017, Salvatore Sanfilippo + * Copyright (c) 2017-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef __LISTPACK_H @@ -70,18 +49,25 @@ unsigned char *lpReplaceInteger(unsigned char *lp, unsigned char **p, long long unsigned char *lpDelete(unsigned char *lp, unsigned char *p, unsigned char **newp); unsigned char *lpDeleteRangeWithEntry(unsigned char *lp, unsigned char **p, unsigned long num); unsigned char *lpDeleteRange(unsigned char *lp, long index, unsigned long num); +unsigned char *lpBatchAppend(unsigned char *lp, listpackEntry *entries, unsigned long len); +unsigned char *lpBatchInsert(unsigned char *lp, unsigned char *p, int where, + listpackEntry *entries, unsigned int len, unsigned char **newp); unsigned char *lpBatchDelete(unsigned char *lp, unsigned char **ps, unsigned long count); unsigned char *lpMerge(unsigned char **first, unsigned char **second); unsigned char *lpDup(unsigned char *lp); unsigned long lpLength(unsigned char *lp); unsigned char *lpGet(unsigned char *p, int64_t *count, unsigned char *intbuf); unsigned char *lpGetValue(unsigned char *p, unsigned int *slen, long long *lval); +int lpGetIntegerValue(unsigned char *p, long long *lval); unsigned char *lpFind(unsigned char *lp, unsigned char *p, unsigned char *s, uint32_t slen, unsigned int skip); +typedef int (*lpCmp)(const unsigned char *lp, unsigned char *p, void *user, unsigned char *s, long long slen); +unsigned 
char *lpFindCb(unsigned char *lp, unsigned char *p, void *user, lpCmp cmp, unsigned int skip); unsigned char *lpFirst(unsigned char *lp); unsigned char *lpLast(unsigned char *lp); unsigned char *lpNext(unsigned char *lp, unsigned char *p); unsigned char *lpPrev(unsigned char *lp, unsigned char *p); size_t lpBytes(unsigned char *lp); +size_t lpEntrySizeInteger(long long lval); size_t lpEstimateBytesRepeatedInteger(long long lval, unsigned long rep); unsigned char *lpSeek(unsigned char *lp, long index); typedef int (*listpackValidateEntryCB)(unsigned char *p, unsigned int head_count, void *userdata); @@ -90,12 +76,15 @@ int lpValidateIntegrity(unsigned char *lp, size_t size, int deep, unsigned char *lpValidateFirst(unsigned char *lp); int lpValidateNext(unsigned char *lp, unsigned char **pp, size_t lpbytes); unsigned int lpCompare(unsigned char *p, unsigned char *s, uint32_t slen); -void lpRandomPair(unsigned char *lp, unsigned long total_count, listpackEntry *key, listpackEntry *val); -void lpRandomPairs(unsigned char *lp, unsigned int count, listpackEntry *keys, listpackEntry *vals); -unsigned int lpRandomPairsUnique(unsigned char *lp, unsigned int count, listpackEntry *keys, listpackEntry *vals); +void lpRandomPair(unsigned char *lp, unsigned long total_count, + listpackEntry *key, listpackEntry *val, int tuple_len); +void lpRandomPairs(unsigned char *lp, unsigned int count, + listpackEntry *keys, listpackEntry *vals, int tuple_len); +unsigned int lpRandomPairsUnique(unsigned char *lp, unsigned int count, + listpackEntry *keys, listpackEntry *vals, int tuple_len); void lpRandomEntries(unsigned char *lp, unsigned int count, listpackEntry *entries); unsigned char *lpNextRandom(unsigned char *lp, unsigned char *p, unsigned int *index, - unsigned int remaining, int even_only); + unsigned int remaining, int tuple_len); int lpSafeToAdd(unsigned char* lp, size_t add); void lpRepr(unsigned char *lp); diff --git a/src/listpack_malloc.h b/src/listpack_malloc.h index 
a8a81c35e03..55c8cf5be2e 100644 --- a/src/listpack_malloc.h +++ b/src/listpack_malloc.h @@ -1,32 +1,11 @@ /* Listpack -- A lists of strings serialization format * https://github.com/antirez/listpack * - * Copyright (c) 2017, Salvatore Sanfilippo + * Copyright (c) 2017-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ /* Allocator selection. 
diff --git a/src/localtime.c b/src/localtime.c index 1cefdfa88cd..7f014cefcb4 100644 --- a/src/localtime.c +++ b/src/localtime.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2018, Salvatore Sanfilippo + * Copyright (c) 2018-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #include diff --git a/src/logreqres.c b/src/logreqres.c index 6e7621d35db..a18bf3efb4e 100644 --- a/src/logreqres.c +++ b/src/logreqres.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2021, Redis Ltd. + * Copyright (c) 2021-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ /* This file implements the interface of logging clients' requests and diff --git a/src/lolwut.c b/src/lolwut.c index c014840e9af..34defdb70fc 100644 --- a/src/lolwut.c +++ b/src/lolwut.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2018, Salvatore Sanfilippo + * Copyright (c) 2018-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
* * ---------------------------------------------------------------------------- * diff --git a/src/lolwut.h b/src/lolwut.h index 682d00531f6..97471ac55e0 100644 --- a/src/lolwut.h +++ b/src/lolwut.h @@ -1,30 +1,9 @@ /* - * Copyright (c) 2018-2019, Salvatore Sanfilippo + * Copyright (c) 2018-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ /* This structure represents our canvas. Drawing functions will take a pointer diff --git a/src/lolwut5.c b/src/lolwut5.c index 1240168d0d8..9f20292f47d 100644 --- a/src/lolwut5.c +++ b/src/lolwut5.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2018, Salvatore Sanfilippo + * Copyright (c) 2018-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
* * ---------------------------------------------------------------------------- * diff --git a/src/lolwut6.c b/src/lolwut6.c index 1ba111c2d0b..1ccc643cb92 100644 --- a/src/lolwut6.c +++ b/src/lolwut6.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2019, Salvatore Sanfilippo + * Copyright (c) 2019-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
* * ---------------------------------------------------------------------------- * diff --git a/src/memtest.c b/src/memtest.c index 1ca4b82cf9c..f5f49d1d3d2 100644 --- a/src/memtest.c +++ b/src/memtest.c @@ -1,36 +1,14 @@ /* - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #include #include #include #include -#include #include #include #include @@ -39,6 +17,7 @@ #include #endif #include "config.h" +#include "redisassert.h" #if (ULONG_MAX == 4294967295UL) #define MEMTEST_32BIT diff --git a/src/mkreleasehdr.sh b/src/mkreleasehdr.sh index 117b9e86f2d..04bc45a1695 100755 --- a/src/mkreleasehdr.sh +++ b/src/mkreleasehdr.sh @@ -1,6 +1,6 @@ #!/bin/sh GIT_SHA1=`(git show-ref --head --hash=8 2> /dev/null || echo 00000000) | head -n1` -GIT_DIRTY=`git diff --no-ext-diff 2> /dev/null | wc -l` +GIT_DIRTY=`git diff --no-ext-diff -- ../src ../deps 2> /dev/null | wc -l` BUILD_ID=`uname -n`"-"`date +%s` if [ -n "$SOURCE_DATE_EPOCH" ]; then BUILD_ID=$(date -u -d "@$SOURCE_DATE_EPOCH" +%s 2>/dev/null || date -u -r "$SOURCE_DATE_EPOCH" +%s 2>/dev/null || date -u +%s) diff --git a/src/module.c b/src/module.c index 0addeecde8f..3920f1cffdf 100644 --- a/src/module.c +++ b/src/module.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2016, Salvatore Sanfilippo + * Copyright (c) 2016-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ /* -------------------------------------------------------------------------- @@ -59,6 +38,7 @@ #include "script.h" #include "call_reply.h" #include "hdr_histogram.h" +#include "crc16_slottable.h" #include #include #include @@ -306,7 +286,6 @@ static size_t moduleTempClientMinCount = 0; /* Min client count in pool since * allow thread safe contexts to execute commands at a safe moment. */ static pthread_mutex_t moduleGIL = PTHREAD_MUTEX_INITIALIZER; - /* Function pointer type for keyspace event notification subscriptions from modules. */ typedef int (*RedisModuleNotificationFunc) (RedisModuleCtx *ctx, int type, const char *event, RedisModuleString *key); @@ -505,6 +484,10 @@ static struct redisCommandArg *moduleCopyCommandArgs(RedisModuleCommandArg *args static redisCommandArgType moduleConvertArgType(RedisModuleCommandArgType type, int *error); static int moduleConvertArgFlags(int flags); void moduleCreateContext(RedisModuleCtx *out_ctx, RedisModule *module, int ctx_flags); + +/* Common helper functions. 
*/ +int moduleVerifyResourceName(const char *name); + /* -------------------------------------------------------------------------- * ## Heap allocation raw functions * @@ -542,11 +525,23 @@ void *RM_Calloc(size_t nmemb, size_t size) { return zcalloc_usable(nmemb*size,NULL); } +/* Similar to RM_Calloc, but returns NULL in case of allocation failure, instead + * of panicking. */ +void *RM_TryCalloc(size_t nmemb, size_t size) { + return ztrycalloc_usable(nmemb*size,NULL); +} + /* Use like realloc() for memory obtained with RedisModule_Alloc(). */ void* RM_Realloc(void *ptr, size_t bytes) { return zrealloc_usable(ptr,bytes,NULL); } +/* Similar to RM_Realloc, but returns NULL in case of allocation failure, + * instead of panicking. */ +void *RM_TryRealloc(void *ptr, size_t bytes) { + return ztryrealloc_usable(ptr,bytes,NULL); +} + /* Use like free() for memory obtained by RedisModule_Alloc() and * RedisModule_Realloc(). However you should never try to free with * RedisModule_Free() memory allocated with malloc() inside your module. */ @@ -750,7 +745,7 @@ int moduleDelKeyIfEmpty(RedisModuleKey *key) { case OBJ_LIST: isempty = listTypeLength(o) == 0; break; case OBJ_SET: isempty = setTypeSize(o) == 0; break; case OBJ_ZSET: isempty = zsetLength(o) == 0; break; - case OBJ_HASH: isempty = hashTypeLength(o) == 0; break; + case OBJ_HASH: isempty = hashTypeLength(o, 0) == 0; break; case OBJ_STREAM: isempty = streamLength(o) == 0; break; default: isempty = 0; } @@ -1447,6 +1442,45 @@ int populateArgsStructure(struct redisCommandArg *args) { return count; } +/* RedisModule_AddACLCategory can be used to add new ACL command categories. Category names + * can only contain alphanumeric characters, underscores, or dashes. Categories can only be added + * during the RedisModule_OnLoad function. Once a category has been added, it can not be removed. + * Any module can register a command to any added categories using RedisModule_SetCommandACLCategories. 
+ * + * Returns: + * - REDISMODULE_OK on successfully adding the new ACL category. + * - REDISMODULE_ERR on failure. + * + * On error the errno is set to: + * - EINVAL if the name contains invalid characters. + * - EBUSY if the category name already exists. + * - ENOMEM if the number of categories reached the max limit of 64 categories. + */ +int RM_AddACLCategory(RedisModuleCtx *ctx, const char *name) { + if (!ctx->module->onload) { + errno = EINVAL; + return REDISMODULE_ERR; + } + + if (moduleVerifyResourceName(name) == REDISMODULE_ERR) { + errno = EINVAL; + return REDISMODULE_ERR; + } + + if (ACLGetCommandCategoryFlagByName(name)) { + errno = EBUSY; + return REDISMODULE_ERR; + } + + if (ACLAddCommandCategory(name, 0)) { + ctx->module->num_acl_categories_added++; + return REDISMODULE_OK; + } else { + errno = ENOMEM; + return REDISMODULE_ERR; + } +} + /* Helper for categoryFlagsFromString(). Attempts to find an acl flag representing the provided flag string * and adds that flag to acl_categories_flags if a match is found. * @@ -2252,6 +2286,7 @@ void RM_SetModuleAttribs(RedisModuleCtx *ctx, const char *name, int ver, int api module->loadmod = NULL; module->num_commands_with_acl_categories = 0; module->onload = 1; + module->num_acl_categories_added = 0; ctx->module = module; } @@ -2294,7 +2329,10 @@ ustime_t RM_CachedMicroseconds(void) { * Within the same command, you can call multiple times * RM_BlockedClientMeasureTimeStart() and RM_BlockedClientMeasureTimeEnd() * to accumulate independent time intervals to the background duration. - * This method always return REDISMODULE_OK. */ + * This method always return REDISMODULE_OK. + * + * This function is not thread safe, If used in module thread and blocked callback (possibly main thread) + * simultaneously, it's recommended to protect them with lock owned by caller instead of GIL. 
*/ int RM_BlockedClientMeasureTimeStart(RedisModuleBlockedClient *bc) { elapsedStart(&(bc->background_timer)); return REDISMODULE_OK; @@ -2304,7 +2342,10 @@ int RM_BlockedClientMeasureTimeStart(RedisModuleBlockedClient *bc) { * to calculate the elapsed execution time. * On success REDISMODULE_OK is returned. * This method only returns REDISMODULE_ERR if no start time was - * previously defined ( meaning RM_BlockedClientMeasureTimeStart was not called ). */ + * previously defined ( meaning RM_BlockedClientMeasureTimeStart was not called ). + * + * This function is not thread safe, If used in module thread and blocked callback (possibly main thread) + * simultaneously, it's recommended to protect them with lock owned by caller instead of GIL. */ int RM_BlockedClientMeasureTimeEnd(RedisModuleBlockedClient *bc) { // If the counter is 0 then we haven't called RM_BlockedClientMeasureTimeStart if (!bc->background_timer) @@ -2363,7 +2404,33 @@ void RM_Yield(RedisModuleCtx *ctx, int flags, const char *busy_reply) { server.busy_module_yield_flags |= BUSY_MODULE_YIELD_CLIENTS; /* Let redis process events */ - processEventsWhileBlocked(); + if (!pthread_equal(server.main_thread_id, pthread_self())) { + /* If we are not in the main thread, we defer event loop processing to the main thread + * after the main thread enters acquiring GIL state in order to protect the event + * loop (ae.c) and avoid potential race conditions. */ + + int acquiring; + atomicGet(server.module_gil_acquring, acquiring); + if (!acquiring) { + /* If the main thread has not yet entered the acquiring GIL state, + * we attempt to wake it up and exit without waiting for it to + * acquire the GIL. This avoids blocking the caller, allowing them to + * continue with unfinished tasks before the next yield. + * We assume the caller keeps the GIL locked. */ + if (write(server.module_pipe[1],"A",1) != 1) { + /* Ignore the error, this is best-effort. 
*/ + } + } else { + /* Release the GIL, yielding CPU to give the main thread an opportunity to start + * event processing, and then acquire the GIL again until the main thread releases it. */ + moduleReleaseGIL(); + usleep(0); + moduleAcquireGIL(); + } + } else { + /* If we are in the main thread, we can safely process events. */ + processEventsWhileBlocked(); + } server.busy_module_yield_reply = prev_busy_module_yield_reply; /* Possibly restore the previous flags in case of two nested contexts @@ -2647,7 +2714,10 @@ RedisModuleString *RM_CreateStringFromStreamID(RedisModuleCtx *ctx, const RedisM * pass ctx as NULL when releasing the string (but passing a context will not * create any issue). Strings created with a context should be freed also passing * the context, so if you want to free a string out of context later, make sure - * to create it using a NULL context. */ + * to create it using a NULL context. + * + * This API is not thread safe, access to these retained strings (if they originated + * from a client command arguments) must be done with GIL locked. */ void RM_FreeString(RedisModuleCtx *ctx, RedisModuleString *str) { decrRefCount(str); if (ctx != NULL) autoMemoryFreed(ctx,REDISMODULE_AM_STRING,str); @@ -2684,7 +2754,10 @@ void RM_FreeString(RedisModuleCtx *ctx, RedisModuleString *str) { * * Threaded modules that reference retained strings from other threads *must* * explicitly trim the allocation as soon as the string is retained. Not doing - * so may result with automatic trimming which is not thread safe. */ + * so may result with automatic trimming which is not thread safe. + * + * This API is not thread safe, access to these retained strings (if they originated + * from a client command arguments) must be done with GIL locked. 
*/ void RM_RetainString(RedisModuleCtx *ctx, RedisModuleString *str) { if (ctx == NULL || !autoMemoryFreed(ctx,REDISMODULE_AM_STRING,str)) { /* Increment the string reference counting only if we can't @@ -2726,7 +2799,10 @@ void RM_RetainString(RedisModuleCtx *ctx, RedisModuleString *str) { * * Threaded modules that reference held strings from other threads *must* * explicitly trim the allocation as soon as the string is held. Not doing - * so may result with automatic trimming which is not thread safe. */ + * so may result with automatic trimming which is not thread safe. + * + * This API is not thread safe, access to these retained strings (if they originated + * from a client command arguments) must be done with GIL locked. */ RedisModuleString* RM_HoldString(RedisModuleCtx *ctx, RedisModuleString *str) { if (str->refcount == OBJ_STATIC_REFCOUNT) { return RM_CreateStringFromString(ctx, str); @@ -3448,9 +3524,7 @@ int RM_ReplyWithLongDouble(RedisModuleCtx *ctx, long double ld) { * * The replicated commands are always wrapped into the MULTI/EXEC that * contains all the commands replicated in a given module command - * execution. However the commands replicated with RedisModule_Call() - * are the first items, the ones replicated with RedisModule_Replicate() - * will all follow before the EXEC. + * execution, in the order they were executed. * * Modules should try to use one interface or the other. * @@ -3472,9 +3546,8 @@ int RM_ReplyWithLongDouble(RedisModuleCtx *ctx, long double ld) { * the callback, and will propagate all the commands wrapped in a MULTI/EXEC * transaction. However when calling this function from a threaded safe context * that can live an undefined amount of time, and can be locked/unlocked in - * at will, the behavior is different: MULTI/EXEC wrapper is not emitted - * and the command specified is inserted in the AOF and replication stream - * immediately. 
+ * at will, it is important to note that this API is not thread-safe and + * must be executed while holding the GIL. * * #### Return value * @@ -3512,15 +3585,18 @@ int RM_Replicate(RedisModuleCtx *ctx, const char *cmdname, const char *fmt, ...) } /* This function will replicate the command exactly as it was invoked - * by the client. Note that this function will not wrap the command into - * a MULTI/EXEC stanza, so it should not be mixed with other replication - * commands. + * by the client. Note that the replicated commands are always wrapped + * into the MULTI/EXEC that contains all the commands replicated in a + * given module command execution, in the order they were executed. * * Basically this form of replication is useful when you want to propagate * the command to the slaves and AOF file exactly as it was called, since * the command can just be re-executed to deterministically re-create the * new state starting from the old one. * + * It is important to note that this API is not thread-safe and + * must be executed while holding the GIL. + * * The function always returns REDISMODULE_OK. */ int RM_ReplicateVerbatim(RedisModuleCtx *ctx) { alsoPropagate(ctx->client->db->id, @@ -4092,7 +4168,7 @@ size_t RM_ValueLength(RedisModuleKey *key) { case OBJ_LIST: return listTypeLength(key->value); case OBJ_SET: return setTypeSize(key->value); case OBJ_ZSET: return zsetLength(key->value); - case OBJ_HASH: return hashTypeLength(key->value); + case OBJ_HASH: return hashTypeLength(key->value, 0); /* OPEN: To subtract expired fields? */ case OBJ_STREAM: return streamLength(key->value); default: return 0; } @@ -4199,7 +4275,7 @@ void RM_ResetDataset(int restart_aof, int async) { /* Returns the number of keys in the current db. */ unsigned long long RM_DbSize(RedisModuleCtx *ctx) { - return dictSize(ctx->client->db->dict); + return dbSize(ctx->client->db); } /* Returns a name of a random key, or NULL if current db is empty. 
*/ @@ -4836,8 +4912,8 @@ int zsetInitScoreRange(RedisModuleKey *key, double min, double max, int minex, i } else if (key->value->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = key->value->ptr; zskiplist *zsl = zs->zsl; - key->u.zset.current = first ? zslFirstInRange(zsl,zrs) : - zslLastInRange(zsl,zrs); + key->u.zset.current = first ? zslNthInRange(zsl,zrs,0) : + zslNthInRange(zsl,zrs,-1); } else { serverPanic("Unsupported zset encoding"); } @@ -4900,8 +4976,8 @@ int zsetInitLexRange(RedisModuleKey *key, RedisModuleString *min, RedisModuleStr } else if (key->value->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = key->value->ptr; zskiplist *zsl = zs->zsl; - key->u.zset.current = first ? zslFirstInLexRange(zsl,zlrs) : - zslLastInLexRange(zsl,zlrs); + key->u.zset.current = first ? zslNthInLexRange(zsl,zlrs,0) : + zslNthInLexRange(zsl,zlrs,-1); } else { serverPanic("Unsupported zset encoding"); } @@ -5195,7 +5271,21 @@ int RM_HashSet(RedisModuleKey *key, int flags, ...) { /* Handle XX and NX */ if (flags & (REDISMODULE_HASH_XX|REDISMODULE_HASH_NX)) { - int exists = hashTypeExists(key->value, field->ptr); + int hfeFlags = HFE_LAZY_AVOID_HASH_DEL; /* Avoid invalidate the key */ + + /* + * The hash might contain expired fields. If we lazily delete expired + * field and the command was sent with XX flag, the operation could + * fail and leave the hash empty, which the caller might not expect. + * To prevent unexpected behavior, we avoid lazy deletion in this case + * yet let the operation fail. Note that moduleDelKeyIfEmpty() + * below won't delete the hash if it left with single expired key + * because hash counts blindly expired fields as well. + */ + if (flags & REDISMODULE_HASH_XX) + hfeFlags |= HFE_LAZY_AVOID_FIELD_DEL; + + int exists = hashTypeExists(key->db, key->value, field->ptr, hfeFlags, NULL); if (((flags & REDISMODULE_HASH_XX) && !exists) || ((flags & REDISMODULE_HASH_NX) && exists)) { @@ -5206,7 +5296,7 @@ int RM_HashSet(RedisModuleKey *key, int flags, ...) 
{ /* Handle deletion if value is REDISMODULE_HASH_DELETE. */ if (value == REDISMODULE_HASH_DELETE) { - count += hashTypeDelete(key->value, field->ptr); + count += hashTypeDelete(key->value, field->ptr, 1); if (flags & REDISMODULE_HASH_CFIELDS) decrRefCount(field); continue; } @@ -5219,8 +5309,8 @@ int RM_HashSet(RedisModuleKey *key, int flags, ...) { low_flags |= HASH_SET_TAKE_FIELD; robj *argv[2] = {field,value}; - hashTypeTryConversion(key->value,argv,0,1); - int updated = hashTypeSet(key->value, field->ptr, value->ptr, low_flags); + hashTypeTryConversion(key->db,key->value,argv,0,1); + int updated = hashTypeSet(key->db, key->value, field->ptr, value->ptr, low_flags); count += (flags & REDISMODULE_HASH_COUNT_ALL) ? 1 : updated; /* If CFIELDS is active, SDS string ownership is now of hashTypeSet(), @@ -5278,6 +5368,7 @@ int RM_HashSet(RedisModuleKey *key, int flags, ...) { * RedisModule_FreeString(), or by enabling automatic memory management. */ int RM_HashGet(RedisModuleKey *key, int flags, ...) { + int hfeFlags = HFE_LAZY_AVOID_FIELD_DEL | HFE_LAZY_AVOID_HASH_DEL; va_list ap; if (key->value && key->value->type != OBJ_HASH) return REDISMODULE_ERR; @@ -5298,14 +5389,17 @@ int RM_HashGet(RedisModuleKey *key, int flags, ...) { /* Query the hash for existence or value object. 
*/ if (flags & REDISMODULE_HASH_EXISTS) { existsptr = va_arg(ap,int*); - if (key->value) - *existsptr = hashTypeExists(key->value,field->ptr); - else + if (key->value) { + *existsptr = hashTypeExists(key->db, key->value, field->ptr, hfeFlags, NULL); + } else { *existsptr = 0; + } } else { valueptr = va_arg(ap,RedisModuleString**); if (key->value) { - *valueptr = hashTypeGetValueObject(key->value,field->ptr); + *valueptr = hashTypeGetValueObject(key->db, key->value, field->ptr, + hfeFlags, NULL); + if (*valueptr) { robj *decoded = getDecodedObject(*valueptr); decrRefCount(*valueptr); @@ -6422,7 +6516,7 @@ RedisModuleCallReply *RM_Call(RedisModuleCtx *ctx, const char *cmdname, const ch c->flags &= ~(CLIENT_READONLY|CLIENT_ASKING); c->flags |= ctx->client->flags & (CLIENT_READONLY|CLIENT_ASKING); if (getNodeByQuery(c,c->cmd,c->argv,c->argc,NULL,&error_code) != - server.cluster->myself) + getMyClusterNode()) { sds msg = NULL; if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) { @@ -7602,6 +7696,13 @@ void RM_LatencyAddSample(const char *event, mstime_t latency) { * https://redis.io/topics/modules-blocking-ops. * -------------------------------------------------------------------------- */ +/* Returns 1 if the client already in the moduleUnblocked list, 0 otherwise. 
*/ +int isModuleClientUnblocked(client *c) { + RedisModuleBlockedClient *bc = c->bstate.module_blocked_handle; + + return bc->unblocked == 1; +} + /* This is called from blocked.c in order to unblock a client: may be called * for multiple reasons while the client is in the middle of being blocked * because the client is terminated, but is also called for cleanup when a @@ -7706,15 +7807,15 @@ RedisModuleBlockedClient *moduleBlockClient(RedisModuleCtx *ctx, RedisModuleCmdF bc->background_timer = 0; bc->background_duration = 0; - c->bstate.timeout = 0; + mstime_t timeout = 0; if (timeout_ms) { mstime_t now = mstime(); - if (timeout_ms > LLONG_MAX - now) { + if (timeout_ms > LLONG_MAX - now) { c->bstate.module_blocked_handle = NULL; addReplyError(c, "timeout is out of range"); /* 'timeout_ms+now' would overflow */ return bc; } - c->bstate.timeout = timeout_ms + now; + timeout = timeout_ms + now; } if (islua || ismulti) { @@ -7730,8 +7831,9 @@ RedisModuleBlockedClient *moduleBlockClient(RedisModuleCtx *ctx, RedisModuleCmdF addReplyError(c, "Clients undergoing module based authentication can only be blocked on auth"); } else { if (keys) { - blockForKeys(c,BLOCKED_MODULE,keys,numkeys,c->bstate.timeout,flags&REDISMODULE_BLOCK_UNBLOCK_DELETED); + blockForKeys(c,BLOCKED_MODULE,keys,numkeys,timeout,flags&REDISMODULE_BLOCK_UNBLOCK_DELETED); } else { + c->bstate.timeout = timeout; blockClient(c,BLOCKED_MODULE); } } @@ -8158,7 +8260,7 @@ int RM_UnblockClient(RedisModuleBlockedClient *bc, void *privdata) { * argument, but better to be safe than sorry. 
*/ if (bc->timeout_callback == NULL) return REDISMODULE_ERR; if (bc->unblocked) return REDISMODULE_OK; - if (bc->client) moduleBlockedClientTimedOut(bc->client); + if (bc->client) moduleBlockedClientTimedOut(bc->client, 1); } moduleUnblockClientByHandle(bc,privdata); return REDISMODULE_OK; @@ -8257,8 +8359,10 @@ void moduleHandleBlockedClients(void) { * This needs to be out of the reply callback above given that a * module might not define any callback and still do blocking ops. */ - if (c && !clientHasModuleAuthInProgress(c) && !bc->blocked_on_keys) { - updateStatsOnUnblock(c, bc->background_duration, reply_us, server.stat_total_error_replies != prev_error_replies); + if (c && !clientHasModuleAuthInProgress(c)) { + int had_errors = c->deferred_reply_errors ? !!listLength(c->deferred_reply_errors) : + (server.stat_total_error_replies != prev_error_replies); + updateStatsOnUnblock(c, bc->background_duration, reply_us, had_errors); } if (c != NULL) { @@ -8276,7 +8380,7 @@ void moduleHandleBlockedClients(void) { * if there are pending replies here. This is needed since * during a non blocking command the client may receive output. */ if (!clientHasModuleAuthInProgress(c) && clientHasPendingReplies(c) && - !(c->flags & CLIENT_PENDING_WRITE)) + !(c->flags & CLIENT_PENDING_WRITE) && c->conn) { c->flags |= CLIENT_PENDING_WRITE; listLinkNodeHead(server.clients_pending_write, &c->clients_pending_write_node); @@ -8311,8 +8415,15 @@ int moduleBlockedClientMayTimeout(client *c) { /* Called when our client timed out. After this function unblockClient() * is called, and it will invalidate the blocked client. So this function * does not need to do any cleanup. Eventually the module will call the - * API to unblock the client and the memory will be released. */ -void moduleBlockedClientTimedOut(client *c) { + * API to unblock the client and the memory will be released. 
+ * + * If this function is called from a module, we handle the timeout callback + * and the update of the unblock status in a thread-safe manner to avoid race + * conditions with the main thread. + * If this function is called from the main thread, we must handle the unblocking + * of the client synchronously. This ensures that we can reply to the client before + * resetClient() is called. */ +void moduleBlockedClientTimedOut(client *c, int from_module) { RedisModuleBlockedClient *bc = c->bstate.module_blocked_handle; /* Protect against re-processing: don't serve clients that are already @@ -8321,14 +8432,27 @@ void moduleBlockedClientTimedOut(client *c) { if (bc->unblocked) return; RedisModuleCtx ctx; - moduleCreateContext(&ctx, bc->module, REDISMODULE_CTX_BLOCKED_TIMEOUT); + int flags = REDISMODULE_CTX_BLOCKED_TIMEOUT; + if (from_module) flags |= REDISMODULE_CTX_THREAD_SAFE; + moduleCreateContext(&ctx, bc->module, flags); ctx.client = bc->client; ctx.blocked_client = bc; ctx.blocked_privdata = bc->privdata; - long long prev_error_replies = server.stat_total_error_replies; - bc->timeout_callback(&ctx,(void**)c->argv,c->argc); + + long long prev_error_replies; + if (!from_module) + prev_error_replies = server.stat_total_error_replies; + + if (bc->timeout_callback) { + /* In theory, the user should always pass the timeout handler as an + * argument, but better to be safe than sorry. 
*/ + bc->timeout_callback(&ctx,(void**)c->argv,c->argc); + } + moduleFreeContext(&ctx); - updateStatsOnUnblock(c, bc->background_duration, 0, server.stat_total_error_replies != prev_error_replies); + + if (!from_module) + updateStatsOnUnblock(c, bc->background_duration, 0, server.stat_total_error_replies != prev_error_replies); /* For timeout events, we do not want to call the disconnect callback, * because the blocked client will be automatically disconnected in @@ -8719,11 +8843,12 @@ void moduleNotifyKeyspaceEvent(int type, const char *event, robj *key, int dbid) /* mark the handler as active to avoid reentrant loops. * If the subscriber performs an action triggering itself, * it will not be notified about it. */ + int prev_active = sub->active; sub->active = 1; server.lazy_expire_disabled++; sub->notify_callback(&ctx, type, event, key); server.lazy_expire_disabled--; - sub->active = 0; + sub->active = prev_active; moduleFreeContext(&ctx); } } @@ -8872,23 +8997,7 @@ char **RM_GetClusterNodesList(RedisModuleCtx *ctx, size_t *numnodes) { UNUSED(ctx); if (!server.cluster_enabled) return NULL; - size_t count = dictSize(server.cluster->nodes); - char **ids = zmalloc((count+1)*REDISMODULE_NODE_ID_LEN); - dictIterator *di = dictGetIterator(server.cluster->nodes); - dictEntry *de; - int j = 0; - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - if (node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) continue; - ids[j] = zmalloc(REDISMODULE_NODE_ID_LEN); - memcpy(ids[j],node->name,REDISMODULE_NODE_ID_LEN); - j++; - } - *numnodes = j; - ids[j] = NULL; /* Null term so that FreeClusterNodesList does not need - * to also get the count argument. */ - dictReleaseIterator(di); - return ids; + return getClusterNodesList(numnodes); } /* Free the node list obtained with RedisModule_GetClusterNodesList. */ @@ -8902,7 +9011,7 @@ void RM_FreeClusterNodesList(char **ids) { * is disabled. 
*/ const char *RM_GetMyClusterID(void) { if (!server.cluster_enabled) return NULL; - return server.cluster->myself->name; + return getMyClusterId(); } /* Return the number of nodes in the cluster, regardless of their state @@ -8911,7 +9020,7 @@ const char *RM_GetMyClusterID(void) { * cluster mode, zero is returned. */ size_t RM_GetClusterSize(void) { if (!server.cluster_enabled) return 0; - return dictSize(server.cluster->nodes); + return getClusterSize(); } /* Populate the specified info for the node having as ID the specified 'id', @@ -8938,20 +9047,19 @@ int RM_GetClusterNodeInfo(RedisModuleCtx *ctx, const char *id, char *ip, char *m UNUSED(ctx); clusterNode *node = clusterLookupNode(id, strlen(id)); - if (node == NULL || - node->flags & (CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE)) + if (node == NULL || clusterNodePending(node)) { return REDISMODULE_ERR; } - if (ip) redis_strlcpy(ip,node->ip,NET_IP_STR_LEN); + if (ip) redis_strlcpy(ip, clusterNodeIp(node),NET_IP_STR_LEN); if (master_id) { /* If the information is not available, the function will set the * field to zero bytes, so that when the field can't be populated the * function kinda remains predictable. */ - if (node->flags & CLUSTER_NODE_SLAVE && node->slaveof) - memcpy(master_id,node->slaveof->name,REDISMODULE_NODE_ID_LEN); + if (clusterNodeIsSlave(node) && clusterNodeGetSlaveof(node)) + memcpy(master_id, clusterNodeGetName(clusterNodeGetSlaveof(node)) ,REDISMODULE_NODE_ID_LEN); else memset(master_id,0,REDISMODULE_NODE_ID_LEN); } @@ -8961,12 +9069,12 @@ int RM_GetClusterNodeInfo(RedisModuleCtx *ctx, const char *id, char *ip, char *m * we can provide binary compatibility. 
*/ if (flags) { *flags = 0; - if (node->flags & CLUSTER_NODE_MYSELF) *flags |= REDISMODULE_NODE_MYSELF; - if (node->flags & CLUSTER_NODE_MASTER) *flags |= REDISMODULE_NODE_MASTER; - if (node->flags & CLUSTER_NODE_SLAVE) *flags |= REDISMODULE_NODE_SLAVE; - if (node->flags & CLUSTER_NODE_PFAIL) *flags |= REDISMODULE_NODE_PFAIL; - if (node->flags & CLUSTER_NODE_FAIL) *flags |= REDISMODULE_NODE_FAIL; - if (node->flags & CLUSTER_NODE_NOFAILOVER) *flags |= REDISMODULE_NODE_NOFAILOVER; + if (clusterNodeIsMyself(node)) *flags |= REDISMODULE_NODE_MYSELF; + if (clusterNodeIsMaster(node)) *flags |= REDISMODULE_NODE_MASTER; + if (clusterNodeIsSlave(node)) *flags |= REDISMODULE_NODE_SLAVE; + if (clusterNodeTimedOut(node)) *flags |= REDISMODULE_NODE_PFAIL; + if (clusterNodeIsFailing(node)) *flags |= REDISMODULE_NODE_FAIL; + if (clusterNodeIsNoFailover(node)) *flags |= REDISMODULE_NODE_NOFAILOVER; } return REDISMODULE_OK; } @@ -8997,6 +9105,19 @@ void RM_SetClusterFlags(RedisModuleCtx *ctx, uint64_t flags) { server.cluster_module_flags |= CLUSTER_MODULE_FLAG_NO_REDIRECTION; } +/* Returns the cluster slot of a key, similar to the `CLUSTER KEYSLOT` command. + * This function works even if cluster mode is not enabled. */ +unsigned int RM_ClusterKeySlot(RedisModuleString *key) { + return keyHashSlot(key->ptr, sdslen(key->ptr)); +} + +/* Returns a short string that can be used as a key or as a hash tag in a key, + * such that the key maps to the given cluster slot. Returns NULL if slot is not + * a valid slot. */ +const char *RM_ClusterCanonicalKeyNameInSlot(unsigned int slot) { + return (slot < CLUSTER_SLOTS) ? 
crc16_slot_table[slot] : NULL; +} + /* -------------------------------------------------------------------------- * ## Modules Timers API * @@ -9102,7 +9223,7 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod while(1) { key = htonu64(expiretime); - if (raxFind(Timers, (unsigned char*)&key,sizeof(key)) == raxNotFound) { + if (!raxFind(Timers, (unsigned char*)&key,sizeof(key),NULL)) { raxInsert(Timers,(unsigned char*)&key,sizeof(key),timer,NULL); break; } else { @@ -9141,8 +9262,11 @@ RedisModuleTimerID RM_CreateTimer(RedisModuleCtx *ctx, mstime_t period, RedisMod * If not NULL, the data pointer is set to the value of the data argument when * the timer was created. */ int RM_StopTimer(RedisModuleCtx *ctx, RedisModuleTimerID id, void **data) { - RedisModuleTimer *timer = raxFind(Timers,(unsigned char*)&id,sizeof(id)); - if (timer == raxNotFound || timer->module != ctx->module) + void *result; + if (!raxFind(Timers,(unsigned char*)&id,sizeof(id),&result)) + return REDISMODULE_ERR; + RedisModuleTimer *timer = result; + if (timer->module != ctx->module) return REDISMODULE_ERR; if (data) *data = timer->data; raxRemove(Timers,(unsigned char*)&id,sizeof(id),NULL); @@ -9157,8 +9281,11 @@ int RM_StopTimer(RedisModuleCtx *ctx, RedisModuleTimerID id, void **data) { * REDISMODULE_OK is returned. The arguments remaining or data can be NULL if * the caller does not need certain information. 
*/ int RM_GetTimerInfo(RedisModuleCtx *ctx, RedisModuleTimerID id, uint64_t *remaining, void **data) { - RedisModuleTimer *timer = raxFind(Timers,(unsigned char*)&id,sizeof(id)); - if (timer == raxNotFound || timer->module != ctx->module) + void *result; + if (!raxFind(Timers,(unsigned char*)&id,sizeof(id),&result)) + return REDISMODULE_ERR; + RedisModuleTimer *timer = result; + if (timer->module != ctx->module) return REDISMODULE_ERR; if (remaining) { int64_t rem = ntohu64(id)-ustime(); @@ -9428,15 +9555,7 @@ void revokeClientAuthentication(client *c) { * is eventually freed we don't rely on the module to still exist. */ moduleNotifyUserChanged(c); - c->user = DefaultUser; - c->authenticated = 0; - /* We will write replies to this client later, so we can't close it - * directly even if async. */ - if (c == server.current_client) { - c->flags |= CLIENT_CLOSE_AFTER_COMMAND; - } else { - freeClientAsync(c); - } + deauthenticateAndCloseClient(c); } /* Cleanup all clients that have been authenticated with this module. This @@ -9926,9 +10045,10 @@ int RM_DictReplace(RedisModuleDict *d, RedisModuleString *key, void *ptr) { * be set by reference to 1 if the key does not exist, or to 0 if the key * exists. */ void *RM_DictGetC(RedisModuleDict *d, void *key, size_t keylen, int *nokey) { - void *res = raxFind(d->rax,key,keylen); - if (nokey) *nokey = (res == raxNotFound); - return (res == raxNotFound) ? NULL : res; + void *res = NULL; + int found = raxFind(d->rax,key,keylen,&res); + if (nokey) *nokey = !found; + return res; } /* Like RedisModule_DictGetC() but takes the key as a RedisModuleString. */ @@ -10350,8 +10470,10 @@ void RM_FreeServerInfo(RedisModuleCtx *ctx, RedisModuleServerInfoData *data) { * mechanism to release the returned string. Return value will be NULL if the * field was not found. 
*/ RedisModuleString *RM_ServerInfoGetField(RedisModuleCtx *ctx, RedisModuleServerInfoData *data, const char* field) { - sds val = raxFind(data->rax, (unsigned char *)field, strlen(field)); - if (val == raxNotFound) return NULL; + void *result; + if (!raxFind(data->rax, (unsigned char *)field, strlen(field), &result)) + return NULL; + sds val = result; RedisModuleString *o = createStringObject(val,sdslen(val)); if (ctx != NULL) autoMemoryAdd(ctx,REDISMODULE_AM_STRING,o); return o; @@ -10359,9 +10481,9 @@ RedisModuleString *RM_ServerInfoGetField(RedisModuleCtx *ctx, RedisModuleServerI /* Similar to RM_ServerInfoGetField, but returns a char* which should not be freed but the caller. */ const char *RM_ServerInfoGetFieldC(RedisModuleServerInfoData *data, const char* field) { - sds val = raxFind(data->rax, (unsigned char *)field, strlen(field)); - if (val == raxNotFound) return NULL; - return val; + void *result = NULL; + raxFind(data->rax, (unsigned char *)field, strlen(field), &result); + return result; } /* Get the value of a field from data collected with RM_GetServerInfo(). If the @@ -10369,11 +10491,12 @@ const char *RM_ServerInfoGetFieldC(RedisModuleServerInfoData *data, const char* * 0, and the optional out_err argument will be set to REDISMODULE_ERR. */ long long RM_ServerInfoGetFieldSigned(RedisModuleServerInfoData *data, const char* field, int *out_err) { long long ll; - sds val = raxFind(data->rax, (unsigned char *)field, strlen(field)); - if (val == raxNotFound) { + void *result; + if (!raxFind(data->rax, (unsigned char *)field, strlen(field), &result)) { if (out_err) *out_err = REDISMODULE_ERR; return 0; } + sds val = result; if (!string2ll(val,sdslen(val),&ll)) { if (out_err) *out_err = REDISMODULE_ERR; return 0; @@ -10387,11 +10510,12 @@ long long RM_ServerInfoGetFieldSigned(RedisModuleServerInfoData *data, const cha * 0, and the optional out_err argument will be set to REDISMODULE_ERR. 
*/ unsigned long long RM_ServerInfoGetFieldUnsigned(RedisModuleServerInfoData *data, const char* field, int *out_err) { unsigned long long ll; - sds val = raxFind(data->rax, (unsigned char *)field, strlen(field)); - if (val == raxNotFound) { + void *result; + if (!raxFind(data->rax, (unsigned char *)field, strlen(field), &result)) { if (out_err) *out_err = REDISMODULE_ERR; return 0; } + sds val = result; if (!string2ull(val,&ll)) { if (out_err) *out_err = REDISMODULE_ERR; return 0; @@ -10405,11 +10529,12 @@ unsigned long long RM_ServerInfoGetFieldUnsigned(RedisModuleServerInfoData *data * optional out_err argument will be set to REDISMODULE_ERR. */ double RM_ServerInfoGetFieldDouble(RedisModuleServerInfoData *data, const char* field, int *out_err) { double dbl; - sds val = raxFind(data->rax, (unsigned char *)field, strlen(field)); - if (val == raxNotFound) { + void *result; + if (!raxFind(data->rax, (unsigned char *)field, strlen(field), &result)) { if (out_err) *out_err = REDISMODULE_ERR; return 0; } + sds val = result; if (!string2d(val,sdslen(val),&dbl)) { if (out_err) *out_err = REDISMODULE_ERR; return 0; @@ -10834,7 +10959,7 @@ typedef struct { } ScanCBData; typedef struct RedisModuleScanCursor{ - unsigned long cursor; + unsigned long long cursor; int done; }RedisModuleScanCursor; @@ -10936,7 +11061,7 @@ int RM_Scan(RedisModuleCtx *ctx, RedisModuleScanCursor *cursor, RedisModuleScanC } int ret = 1; ScanCBData data = { ctx, privdata, fn }; - cursor->cursor = dictScan(ctx->client->db->dict, cursor->cursor, moduleScanCallback, &data); + cursor->cursor = dbScan(ctx->client->db, cursor->cursor, moduleScanCallback, &data); if (cursor->cursor == 0) { cursor->done = 1; ret = 0; @@ -10956,18 +11081,27 @@ static void moduleScanKeyCallback(void *privdata, const dictEntry *de) { ScanKeyCBData *data = privdata; sds key = dictGetKey(de); robj *o = data->key->value; - robj *field = createStringObject(key, sdslen(key)); + robj *field = NULL; robj *value = NULL; if (o->type == 
OBJ_SET) { value = NULL; } else if (o->type == OBJ_HASH) { sds val = dictGetVal(de); + + /* If field is expired, then ignore */ + if (hfieldIsExpired(key)) + return; + + field = createStringObject(key, hfieldlen(key)); value = createStringObject(val, sdslen(val)); } else if (o->type == OBJ_ZSET) { double *val = (double*)dictGetVal(de); value = createStringObjectFromLongDouble(*val, 0); } + /* if type is OBJ_HASH then key is of type hfield. Otherwise sds. */ + if (!field) field = createStringObject(key, sdslen(key)); + data->fn(data->key, field, value, data->user_data); decrRefCount(field); if (value) decrRefCount(value); @@ -11066,22 +11200,44 @@ int RM_ScanKey(RedisModuleKey *key, RedisModuleScanCursor *cursor, RedisModuleSc cursor->done = 1; ret = 0; } else if (o->type == OBJ_ZSET || o->type == OBJ_HASH) { - unsigned char *p = lpSeek(o->ptr,0); - unsigned char *vstr; - unsigned int vlen; - long long vll; + unsigned char *lp, *p; + /* is hash with expiry on fields, then lp tuples are [field][value][expire] */ + int hfe = o->type == OBJ_HASH && o->encoding == OBJ_ENCODING_LISTPACK_EX; + + if (o->type == OBJ_HASH) + lp = hashTypeListpackGetLp(o); + else + lp = o->ptr; + + p = lpSeek(lp,0); while(p) { - vstr = lpGetValue(p,&vlen,&vll); - robj *field = (vstr != NULL) ? - createStringObject((char*)vstr,vlen) : - createStringObjectFromLongLongWithSds(vll); - p = lpNext(o->ptr,p); - vstr = lpGetValue(p,&vlen,&vll); - robj *value = (vstr != NULL) ? 
- createStringObject((char*)vstr,vlen) : - createStringObjectFromLongLongWithSds(vll); + long long vllField, vllValue, vllExpire; + unsigned int lenField, lenValue; + unsigned char *pField, *pValue; + + pField = lpGetValue(p,&lenField,&vllField); + p = lpNext(lp,p); + pValue = lpGetValue(p,&lenValue,&vllValue); + p = lpNext(lp,p); + + if (hfe) { + serverAssert(lpGetIntegerValue(p, &vllExpire)); + p = lpNext(lp, p); + + /* Skip expired fields */ + if (hashTypeIsExpired(o, vllExpire)) + continue; + } + + robj *value = (pValue != NULL) ? + createStringObject((char*)pValue,lenValue) : + createStringObjectFromLongLongWithSds(vllValue); + + robj *field = (pField != NULL) ? + createStringObject((char*)pField,lenField) : + createStringObjectFromLongLongWithSds(vllField); fn(key, field, value, privdata); - p = lpNext(o->ptr,p); + decrRefCount(field); decrRefCount(value); } @@ -11093,7 +11249,6 @@ int RM_ScanKey(RedisModuleKey *key, RedisModuleScanCursor *cursor, RedisModuleSc return ret; } - /* -------------------------------------------------------------------------- * ## Module fork API * -------------------------------------------------------------------------- */ @@ -11848,6 +12003,7 @@ void moduleInitModulesSystem(void) { moduleUnblockedClients = listCreate(); server.loadmodule_queue = listCreate(); server.module_configs_queue = dictCreate(&sdsKeyValueHashDictType); + server.module_gil_acquring = 0; modules = dictCreate(&modulesDictType); moduleAuthCallbacks = listCreate(); @@ -11937,6 +12093,13 @@ void moduleRemoveConfigs(RedisModule *module) { } } +/* Remove ACL categories added by the module when it fails to load. */ +void moduleRemoveCateogires(RedisModule *module) { + if (module->num_acl_categories_added) { + ACLCleanupCategoriesOnFailure(module->num_acl_categories_added); + } +} + /* Load all the modules in the server.loadmodule_queue list, which is * populated by `loadmodule` directives in the configuration file. 
* We can't load modules directly when processing the configuration file @@ -12115,6 +12278,19 @@ int parseLoadexArguments(RedisModuleString ***module_argv, int *module_argc) { return REDISMODULE_OK; } +/* Unregister module-related things, called when moduleLoad fails or moduleUnload. */ +void moduleUnregisterCleanup(RedisModule *module) { + moduleFreeAuthenticatedClients(module); + moduleUnregisterCommands(module); + moduleUnsubscribeNotifications(module); + moduleUnregisterSharedAPI(module); + moduleUnregisterUsedAPI(module); + moduleUnregisterFilters(module); + moduleUnsubscribeAllServerEvents(module); + moduleRemoveConfigs(module); + moduleUnregisterAuthCBs(module); +} + /* Load a module and initialize it. On success C_OK is returned, otherwise * C_ERR is returned. */ int moduleLoad(const char *path, void **module_argv, int module_argc, int is_loadex) { @@ -12149,11 +12325,8 @@ int moduleLoad(const char *path, void **module_argv, int module_argc, int is_loa serverLog(LL_WARNING, "Module %s initialization failed. Module not loaded",path); if (ctx.module) { - moduleUnregisterCommands(ctx.module); - moduleUnregisterSharedAPI(ctx.module); - moduleUnregisterUsedAPI(ctx.module); - moduleRemoveConfigs(ctx.module); - moduleUnregisterAuthCBs(ctx.module); + moduleUnregisterCleanup(ctx.module); + moduleRemoveCateogires(ctx.module); moduleFreeModuleStructure(ctx.module); } moduleFreeContext(&ctx); @@ -12194,8 +12367,6 @@ int moduleLoad(const char *path, void **module_argv, int module_argc, int is_loa } if (post_load_err) { - /* Unregister module auth callbacks (if any exist) that this Module registered onload. 
*/ - moduleUnregisterAuthCBs(ctx.module); moduleUnload(ctx.module->name, NULL); moduleFreeContext(&ctx); return C_ERR; @@ -12253,17 +12424,7 @@ int moduleUnload(sds name, const char **errmsg) { } } - moduleFreeAuthenticatedClients(module); - moduleUnregisterCommands(module); - moduleUnregisterSharedAPI(module); - moduleUnregisterUsedAPI(module); - moduleUnregisterFilters(module); - moduleUnregisterAuthCBs(module); - moduleRemoveConfigs(module); - - /* Remove any notification subscribers this module might have */ - moduleUnsubscribeNotifications(module); - moduleUnsubscribeAllServerEvents(module); + moduleUnregisterCleanup(module); /* Unload the dynamic library. */ if (dlclose(module->handle) == -1) { @@ -12395,7 +12556,7 @@ sds genModulesInfoString(sds info) { * -------------------------------------------------------------------------- */ /* Check if the configuration name is already registered */ -int isModuleConfigNameRegistered(RedisModule *module, sds name) { +int isModuleConfigNameRegistered(RedisModule *module, const char *name) { listNode *match = listSearchKey(module->module_configs, (void *) name); return match != NULL; } @@ -12424,12 +12585,14 @@ int moduleVerifyConfigFlags(unsigned int flags, configType type) { return REDISMODULE_OK; } -int moduleVerifyConfigName(sds name) { - if (sdslen(name) == 0) { - serverLogRaw(LL_WARNING, "Module config names cannot be an empty string."); +/* Verify a module resource or name has only alphanumeric characters, underscores + * or dashes. 
*/ +int moduleVerifyResourceName(const char *name) { + if (name[0] == '\0') { return REDISMODULE_ERR; } - for (size_t i = 0 ; i < sdslen(name) ; ++i) { + + for (size_t i = 0; name[i] != '\0'; i++) { char curr_char = name[i]; if ((curr_char >= 'a' && curr_char <= 'z') || (curr_char >= 'A' && curr_char <= 'Z') || @@ -12438,7 +12601,7 @@ int moduleVerifyConfigName(sds name) { { continue; } - serverLog(LL_WARNING, "Invalid character %c in Module Config name %s.", curr_char, name); + serverLog(LL_WARNING, "Invalid character %c in Module resource name %s.", curr_char, name); return REDISMODULE_ERR; } return REDISMODULE_OK; @@ -12583,21 +12746,21 @@ int moduleConfigApplyConfig(list *module_configs, const char **err, const char * * -------------------------------------------------------------------------- */ /* Create a module config object. */ -ModuleConfig *createModuleConfig(sds name, RedisModuleConfigApplyFunc apply_fn, void *privdata, RedisModule *module) { +ModuleConfig *createModuleConfig(const char *name, RedisModuleConfigApplyFunc apply_fn, void *privdata, RedisModule *module) { ModuleConfig *new_config = zmalloc(sizeof(ModuleConfig)); - new_config->name = sdsdup(name); + new_config->name = sdsnew(name); new_config->apply_fn = apply_fn; new_config->privdata = privdata; new_config->module = module; return new_config; } -int moduleConfigValidityCheck(RedisModule *module, sds name, unsigned int flags, configType type) { +int moduleConfigValidityCheck(RedisModule *module, const char *name, unsigned int flags, configType type) { if (!module->onload) { errno = EBUSY; return REDISMODULE_ERR; } - if (moduleVerifyConfigFlags(flags, type) || moduleVerifyConfigName(name)) { + if (moduleVerifyConfigFlags(flags, type) || moduleVerifyResourceName(name)) { errno = EINVAL; return REDISMODULE_ERR; } @@ -12708,13 +12871,10 @@ unsigned int maskModuleEnumConfigFlags(unsigned int flags) { * * EALREADY: The provided configuration name is already used. 
*/ int RM_RegisterStringConfig(RedisModuleCtx *ctx, const char *name, const char *default_val, unsigned int flags, RedisModuleConfigGetStringFunc getfn, RedisModuleConfigSetStringFunc setfn, RedisModuleConfigApplyFunc applyfn, void *privdata) { RedisModule *module = ctx->module; - sds config_name = sdsnew(name); - if (moduleConfigValidityCheck(module, config_name, flags, NUMERIC_CONFIG)) { - sdsfree(config_name); + if (moduleConfigValidityCheck(module, name, flags, NUMERIC_CONFIG)) { return REDISMODULE_ERR; } - ModuleConfig *new_config = createModuleConfig(config_name, applyfn, privdata, module); - sdsfree(config_name); + ModuleConfig *new_config = createModuleConfig(name, applyfn, privdata, module); new_config->get_fn.get_string = getfn; new_config->set_fn.set_string = setfn; listAddNodeTail(module->module_configs, new_config); @@ -12728,13 +12888,10 @@ int RM_RegisterStringConfig(RedisModuleCtx *ctx, const char *name, const char *d * RedisModule_RegisterStringConfig for detailed information about configs. 
*/ int RM_RegisterBoolConfig(RedisModuleCtx *ctx, const char *name, int default_val, unsigned int flags, RedisModuleConfigGetBoolFunc getfn, RedisModuleConfigSetBoolFunc setfn, RedisModuleConfigApplyFunc applyfn, void *privdata) { RedisModule *module = ctx->module; - sds config_name = sdsnew(name); - if (moduleConfigValidityCheck(module, config_name, flags, BOOL_CONFIG)) { - sdsfree(config_name); + if (moduleConfigValidityCheck(module, name, flags, BOOL_CONFIG)) { return REDISMODULE_ERR; } - ModuleConfig *new_config = createModuleConfig(config_name, applyfn, privdata, module); - sdsfree(config_name); + ModuleConfig *new_config = createModuleConfig(name, applyfn, privdata, module); new_config->get_fn.get_bool = getfn; new_config->set_fn.set_bool = setfn; listAddNodeTail(module->module_configs, new_config); @@ -12774,13 +12931,10 @@ int RM_RegisterBoolConfig(RedisModuleCtx *ctx, const char *name, int default_val * See RedisModule_RegisterStringConfig for detailed general information about configs. 
*/ int RM_RegisterEnumConfig(RedisModuleCtx *ctx, const char *name, int default_val, unsigned int flags, const char **enum_values, const int *int_values, int num_enum_vals, RedisModuleConfigGetEnumFunc getfn, RedisModuleConfigSetEnumFunc setfn, RedisModuleConfigApplyFunc applyfn, void *privdata) { RedisModule *module = ctx->module; - sds config_name = sdsnew(name); - if (moduleConfigValidityCheck(module, config_name, flags, ENUM_CONFIG)) { - sdsfree(config_name); + if (moduleConfigValidityCheck(module, name, flags, ENUM_CONFIG)) { return REDISMODULE_ERR; } - ModuleConfig *new_config = createModuleConfig(config_name, applyfn, privdata, module); - sdsfree(config_name); + ModuleConfig *new_config = createModuleConfig(name, applyfn, privdata, module); new_config->get_fn.get_enum = getfn; new_config->set_fn.set_enum = setfn; configEnum *enum_vals = zmalloc((num_enum_vals + 1) * sizeof(configEnum)); @@ -12802,13 +12956,10 @@ int RM_RegisterEnumConfig(RedisModuleCtx *ctx, const char *name, int default_val * RedisModule_RegisterStringConfig for detailed information about configs. 
*/ int RM_RegisterNumericConfig(RedisModuleCtx *ctx, const char *name, long long default_val, unsigned int flags, long long min, long long max, RedisModuleConfigGetNumericFunc getfn, RedisModuleConfigSetNumericFunc setfn, RedisModuleConfigApplyFunc applyfn, void *privdata) { RedisModule *module = ctx->module; - sds config_name = sdsnew(name); - if (moduleConfigValidityCheck(module, config_name, flags, NUMERIC_CONFIG)) { - sdsfree(config_name); + if (moduleConfigValidityCheck(module, name, flags, NUMERIC_CONFIG)) { return REDISMODULE_ERR; } - ModuleConfig *new_config = createModuleConfig(config_name, applyfn, privdata, module); - sdsfree(config_name); + ModuleConfig *new_config = createModuleConfig(name, applyfn, privdata, module); new_config->get_fn.get_numeric = getfn; new_config->set_fn.set_numeric = setfn; listAddNodeTail(module->module_configs, new_config); @@ -13497,7 +13648,9 @@ void moduleRegisterCoreAPI(void) { REGISTER_API(Alloc); REGISTER_API(TryAlloc); REGISTER_API(Calloc); + REGISTER_API(TryCalloc); REGISTER_API(Realloc); + REGISTER_API(TryRealloc); REGISTER_API(Free); REGISTER_API(Strdup); REGISTER_API(CreateCommand); @@ -13505,6 +13658,7 @@ void moduleRegisterCoreAPI(void) { REGISTER_API(CreateSubcommand); REGISTER_API(SetCommandInfo); REGISTER_API(SetCommandACLCategories); + REGISTER_API(AddACLCategory); REGISTER_API(SetModuleAttribs); REGISTER_API(IsModuleNameBusy); REGISTER_API(WrongArity); @@ -13723,6 +13877,8 @@ void moduleRegisterCoreAPI(void) { REGISTER_API(SetDisconnectCallback); REGISTER_API(GetBlockedClientHandle); REGISTER_API(SetClusterFlags); + REGISTER_API(ClusterKeySlot); + REGISTER_API(ClusterCanonicalKeyNameInSlot); REGISTER_API(CreateDict); REGISTER_API(FreeDict); REGISTER_API(DictSize); diff --git a/src/modules/helloblock.c b/src/modules/helloblock.c index dc3d74975f9..a956c78cebd 100644 --- a/src/modules/helloblock.c +++ b/src/modules/helloblock.c @@ -3,32 +3,11 @@ * * 
----------------------------------------------------------------------------- * - * Copyright (c) 2016, Salvatore Sanfilippo + * Copyright (c) 2016-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #include "../redismodule.h" diff --git a/src/modules/hellocluster.c b/src/modules/hellocluster.c index bc145c2b225..95f468cbe07 100644 --- a/src/modules/hellocluster.c +++ b/src/modules/hellocluster.c @@ -2,32 +2,11 @@ * * ----------------------------------------------------------------------------- * - * Copyright (c) 2018, Salvatore Sanfilippo + * Copyright (c) 2018-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
+ * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "../redismodule.h" diff --git a/src/modules/hellodict.c b/src/modules/hellodict.c index 12b6e91d254..607c3e31044 100644 --- a/src/modules/hellodict.c +++ b/src/modules/hellodict.c @@ -5,32 +5,11 @@ * * ----------------------------------------------------------------------------- * - * Copyright (c) 2018, Salvatore Sanfilippo + * Copyright (c) 2018-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "../redismodule.h" diff --git a/src/modules/hellohook.c b/src/modules/hellohook.c index 2859a8b26a2..836f43a9bec 100644 --- a/src/modules/hellohook.c +++ b/src/modules/hellohook.c @@ -2,32 +2,11 @@ * * ----------------------------------------------------------------------------- * - * Copyright (c) 2019, Salvatore Sanfilippo + * Copyright (c) 2019-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "../redismodule.h" diff --git a/src/modules/hellotimer.c b/src/modules/hellotimer.c index 67e1e671430..b891c41c367 100644 --- a/src/modules/hellotimer.c +++ b/src/modules/hellotimer.c @@ -2,32 +2,11 @@ * * ----------------------------------------------------------------------------- * - * Copyright (c) 2018, Salvatore Sanfilippo + * Copyright (c) 2018-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "../redismodule.h" diff --git a/src/modules/hellotype.c b/src/modules/hellotype.c index 1dc53d24c16..16343aa4e54 100644 --- a/src/modules/hellotype.c +++ b/src/modules/hellotype.c @@ -7,32 +7,11 @@ * * ----------------------------------------------------------------------------- * - * Copyright (c) 2016, Salvatore Sanfilippo + * Copyright (c) 2016-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "../redismodule.h" diff --git a/src/modules/helloworld.c b/src/modules/helloworld.c index e5179631018..cdbc8c89c37 100644 --- a/src/modules/helloworld.c +++ b/src/modules/helloworld.c @@ -6,32 +6,11 @@ * * ----------------------------------------------------------------------------- * - * Copyright (c) 2016, Salvatore Sanfilippo + * Copyright (c) 2016-Present, Redis Ltd. * All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "../redismodule.h" diff --git a/src/monotonic.c b/src/monotonic.c index 1d71962f304..6da03677bda 100644 --- a/src/monotonic.c +++ b/src/monotonic.c @@ -3,10 +3,7 @@ #include #include #include - -#undef NDEBUG -#include - +#include "redisassert.h" /* The function pointer for clock retrieval. 
*/ monotime (*getMonotonicUs)(void) = NULL; diff --git a/src/mstr.c b/src/mstr.c new file mode 100644 index 00000000000..39200d7314b --- /dev/null +++ b/src/mstr.c @@ -0,0 +1,524 @@ +/* + * Copyright Redis Ltd. 2024 - present + * + * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) + * or the Server Side Public License v1 (SSPLv1). + */ + +#include +#include +#include "sdsalloc.h" +#include "mstr.h" +#include "stdio.h" + +#define NULL_SIZE 1 + +static inline char mstrReqType(size_t string_size); +static inline int mstrHdrSize(char type); +static inline int mstrSumMetaLen(mstrKind *k, mstrFlags flags); +static inline size_t mstrAllocLen(const mstr s, struct mstrKind *kind); + +/*** mstr API ***/ + +/* Create mstr without any metadata attached, based on string 'initStr'. + * - If initStr equals NULL, then only allocation will be made. + * - string of mstr is always null-terminated. + */ +mstr mstrNew(const char *initStr, size_t lenStr, int trymalloc) { + unsigned char *pInfo; /* pointer to mstr info field */ + void *sh; + mstr s; + char type = mstrReqType(lenStr); + int mstrHdr = mstrHdrSize(type); + + assert(lenStr + mstrHdr + 1 > lenStr); /* Catch size_t overflow */ + + size_t len = mstrHdr + lenStr + NULL_SIZE; + sh = trymalloc? 
s_trymalloc(len) : s_malloc(len); + + if (sh == NULL) return NULL; + + s = (char*)sh + mstrHdr; + pInfo = ((unsigned char*)s) - 1; + + switch(type) { + case MSTR_TYPE_5: { + *pInfo = CREATE_MSTR_INFO(lenStr, 0 /*ismeta*/, type); + break; + } + case MSTR_TYPE_8: { + MSTR_HDR_VAR(8,s); + *pInfo = CREATE_MSTR_INFO(0 /*unused*/, 0 /*ismeta*/, type); + sh->len = lenStr; + break; + } + case MSTR_TYPE_16: { + MSTR_HDR_VAR(16,s); + *pInfo = CREATE_MSTR_INFO(0 /*unused*/, 0 /*ismeta*/, type); + sh->len = lenStr; + break; + } + case MSTR_TYPE_64: { + MSTR_HDR_VAR(64,s); + *pInfo = CREATE_MSTR_INFO(0 /*unused*/, 0 /*ismeta*/, type); + sh->len = lenStr; + break; + } + } + + if (initStr && lenStr) + memcpy(s, initStr, lenStr); + + s[lenStr] = '\0'; + return s; +} + +/* Creates mstr with given string. Reserve space for metadata. + * + * Note: mstrNew(s,l) and mstrNewWithMeta(s,l,0) are not the same. The first allocates + * just string. The second allocates a string with flags (yet without any metadata + * structures allocated). + */ +mstr mstrNewWithMeta(struct mstrKind *kind, const char *initStr, size_t lenStr, mstrFlags metaFlags, int trymalloc) { + unsigned char *pInfo; /* pointer to mstr info field */ + char *allocMstr; + mstr mstrPtr; + char type = mstrReqType(lenStr); + int mstrHdr = mstrHdrSize(type); + int sumMetaLen = mstrSumMetaLen(kind, metaFlags); + + + /* mstrSumMetaLen() + sizeof(mstrFlags) + sizeof(mstrhdrX) + lenStr */ + + size_t allocLen = sumMetaLen + sizeof(mstrFlags) + mstrHdr + lenStr + NULL_SIZE; + allocMstr = trymalloc? 
s_trymalloc(allocLen) : s_malloc(allocLen); + + if (allocMstr == NULL) return NULL; + + /* metadata is located at the beginning of the allocation, then meta-flags and lastly the string */ + mstrFlags *pMetaFlags = (mstrFlags *) (allocMstr + sumMetaLen) ; + mstrPtr = ((char*) pMetaFlags) + sizeof(mstrFlags) + mstrHdr; + pInfo = ((unsigned char*)mstrPtr) - 1; + + switch(type) { + case MSTR_TYPE_5: { + *pInfo = CREATE_MSTR_INFO(lenStr, 1 /*ismeta*/, type); + break; + } + case MSTR_TYPE_8: { + MSTR_HDR_VAR(8, mstrPtr); + sh->len = lenStr; + *pInfo = CREATE_MSTR_INFO(0 /*unused*/, 1 /*ismeta*/, type); + break; + } + case MSTR_TYPE_16: { + MSTR_HDR_VAR(16, mstrPtr); + sh->len = lenStr; + *pInfo = CREATE_MSTR_INFO(0 /*unused*/, 1 /*ismeta*/, type); + break; + } + case MSTR_TYPE_64: { + MSTR_HDR_VAR(64, mstrPtr); + sh->len = lenStr; + *pInfo = CREATE_MSTR_INFO(0 /*unused*/, 1 /*ismeta*/, type); + break; + } + } + *pMetaFlags = metaFlags; + if (initStr != NULL) memcpy(mstrPtr, initStr, lenStr); + mstrPtr[lenStr] = '\0'; + + return mstrPtr; +} + +/* Create copy of mstr. Flags can be modified. For each metadata flag, if + * same flag is set on both, then copy its metadata. 
*/ +mstr mstrNewCopy(struct mstrKind *kind, mstr src, mstrFlags newFlags) { + mstr dst; + + /* if no flags are set, then just copy the string */ + if (newFlags == 0) return mstrNew(src, mstrlen(src), 0); + + dst = mstrNewWithMeta(kind, src, mstrlen(src), newFlags, 0); + memcpy(dst, src, mstrlen(src) + 1); + + /* if metadata is attached to src, then selectively copy metadata */ + if (mstrIsMetaAttached(src)) { + mstrFlags *pFlags1 = mstrFlagsRef(src), + *pFlags2 = mstrFlagsRef(dst); + + mstrFlags flags1Shift = *pFlags1, + flags2Shift = *pFlags2; + + unsigned char *at1 = ((unsigned char *) pFlags1), + *at2 = ((unsigned char *) pFlags2); + + /* if the flag is set on both, then copy the metadata */ + for (int i = 0; flags1Shift != 0; ++i) { + int isFlag1Set = flags1Shift & 0x1; + int isFlag2Set = flags2Shift & 0x1; + + if (isFlag1Set) at1 -= kind->metaSize[i]; + if (isFlag2Set) at2 -= kind->metaSize[i]; + + if (isFlag1Set && isFlag2Set) + memcpy(at2, at1, kind->metaSize[i]); + flags1Shift >>= 1; + flags2Shift >>= 1; + } + } + return dst; +} + +/* Free mstring. Note, mstrKind is required to eval sizeof metadata and find start + * of allocation but if mstrIsMetaAttached(s) is false, you can pass NULL as well. + */ +void mstrFree(struct mstrKind *kind, mstr s) { + if (s != NULL) + s_free(mstrGetAllocPtr(kind, s)); +} + +/* return ref to metadata flags. Useful to modify directly flags which doesn't + * include metadata payload */ +mstrFlags *mstrFlagsRef(mstr s) { + switch(s[-1]&MSTR_TYPE_MASK) { + case MSTR_TYPE_5: + return ((mstrFlags *) (s - sizeof(struct mstrhdr5))) - 1; + case MSTR_TYPE_8: + return ((mstrFlags *) (s - sizeof(struct mstrhdr8))) - 1; + case MSTR_TYPE_16: + return ((mstrFlags *) (s - sizeof(struct mstrhdr16))) - 1; + default: /* MSTR_TYPE_64: */ + return ((mstrFlags *) (s - sizeof(struct mstrhdr64))) - 1; + } +} + +/* Return a reference to corresponding metadata of the specified metadata flag + * index (flagIdx). 
If the metadata doesn't exist, it still returns a reference + * to the starting location where it would have been written among other metadatas. + * To verify if `flagIdx` of some metadata is attached, use `mstrGetFlag(s, flagIdx)`. + */ +void *mstrMetaRef(mstr s, struct mstrKind *kind, int flagIdx) { + int metaOffset = 0; + /* start iterating from flags backward */ + mstrFlags *pFlags = mstrFlagsRef(s); + mstrFlags tmp = *pFlags; + + for (int i = 0 ; i <= flagIdx ; ++i) { + if (tmp & 0x1) metaOffset += kind->metaSize[i]; + tmp >>= 1; + } + return ((char *)pFlags) - metaOffset; +} + +/* mstr layout: [meta-data#N]...[meta-data#0][mstrFlags][mstrhdr][string][null] */ +void *mstrGetAllocPtr(struct mstrKind *kind, mstr str) { + if (!mstrIsMetaAttached(str)) + return (char*)str - mstrHdrSize(str[-1]); + + int totalMetaLen = mstrSumMetaLen(kind, *mstrFlagsRef(str)); + return (char*)str - mstrHdrSize(str[-1]) - sizeof(mstrFlags) - totalMetaLen; +} + +/* Prints in the following fashion: + * [0x7f8bd8816017] my_mstr: foo (strLen=3, mstrLen=11, isMeta=1, metaFlags=0x1) + * [0x7f8bd8816010] >> meta[0]: 0x78 0x56 0x34 0x12 (metaLen=4) + */ +void mstrPrint(mstr s, struct mstrKind *kind, int verbose) { + mstrFlags mflags, tmp; + int isMeta = mstrIsMetaAttached(s); + + tmp = mflags = (isMeta) ? 
*mstrFlagsRef(s) : 0; + + if (!isMeta) { + printf("[%p] %s: %s (strLen=%zu, mstrLen=%zu, isMeta=0)\n", + (void *)s, kind->name, s, mstrlen(s), mstrAllocLen(s, kind)); + return; + } + + printf("[%p] %s: %s (strLen=%zu, mstrLen=%zu, isMeta=1, metaFlags=0x%x)\n", + (void *)s, kind->name, s, mstrlen(s), mstrAllocLen(s, kind), mflags); + + if (verbose) { + for (unsigned int i = 0 ; i < NUM_MSTR_FLAGS ; ++i) { + if (tmp & 0x1) { + int mSize = kind->metaSize[i]; + void *mRef = mstrMetaRef(s, kind, i); + printf("[%p] >> meta[%d]:", mRef, i); + for (int j = 0 ; j < mSize ; ++j) { + printf(" 0x%02x", ((unsigned char *) mRef)[j]); + } + printf(" (metaLen=%d)\n", mSize); + } + tmp >>= 1; + } + } +} + +/* return length of the string (ignoring metadata attached) */ +size_t mstrlen(const mstr s) { + unsigned char info = s[-1]; + switch(info & MSTR_TYPE_MASK) { + case MSTR_TYPE_5: + return MSTR_TYPE_5_LEN(info); + case MSTR_TYPE_8: + return MSTR_HDR(8,s)->len; + case MSTR_TYPE_16: + return MSTR_HDR(16,s)->len; + default: /* MSTR_TYPE_64: */ + return MSTR_HDR(64,s)->len; + } +} + +/*** mstr internals ***/ + +static inline int mstrSumMetaLen(mstrKind *k, mstrFlags flags) { + int total = 0; + int i = 0 ; + while (flags) { + total += (flags & 0x1) ? 
k->metaSize[i] : 0; + flags >>= 1; + ++i; + } + return total; +} + +/* mstrSumMetaLen() + sizeof(mstrFlags) + sizeof(mstrhdrX) + strlen + '\0' */ +static inline size_t mstrAllocLen(const mstr s, struct mstrKind *kind) { + int hdrlen; + mstrFlags *pMetaFlags; + size_t strlen = 0; + + int isMeta = mstrIsMetaAttached(s); + unsigned char info = s[-1]; + + switch(info & MSTR_TYPE_MASK) { + case MSTR_TYPE_5: + strlen = MSTR_TYPE_5_LEN(info); + hdrlen = sizeof(struct mstrhdr5); + pMetaFlags = ((mstrFlags *) MSTR_HDR(5, s)) - 1; + break; + case MSTR_TYPE_8: + strlen = MSTR_HDR(8,s)->len; + hdrlen = sizeof(struct mstrhdr8); + pMetaFlags = ((mstrFlags *) MSTR_HDR(8, s)) - 1; + break; + case MSTR_TYPE_16: + strlen = MSTR_HDR(16,s)->len; + hdrlen = sizeof(struct mstrhdr16); + pMetaFlags = ((mstrFlags *) MSTR_HDR(16, s)) - 1; + break; + default: /* MSTR_TYPE_64: */ + strlen = MSTR_HDR(64,s)->len; + hdrlen = sizeof(struct mstrhdr64); + pMetaFlags = ((mstrFlags *) MSTR_HDR(64, s)) - 1; + break; + } + return hdrlen + strlen + NULL_SIZE + ((isMeta) ? (mstrSumMetaLen(kind, *pMetaFlags) + sizeof(mstrFlags)) : 0); +} + +/* returns pointer to the beginning of malloc() of mstr */ +void *mstrGetStartAlloc(mstr s, struct mstrKind *kind) { + int hdrlen; + mstrFlags *pMetaFlags; + + int isMeta = mstrIsMetaAttached(s); + + switch(s[-1]&MSTR_TYPE_MASK) { + case MSTR_TYPE_5: + hdrlen = sizeof(struct mstrhdr5); + pMetaFlags = ((mstrFlags *) MSTR_HDR(5, s)) - 1; + break; + case MSTR_TYPE_8: + hdrlen = sizeof(struct mstrhdr8); + pMetaFlags = ((mstrFlags *) MSTR_HDR(8, s)) - 1; + break; + case MSTR_TYPE_16: + hdrlen = sizeof(struct mstrhdr16); + pMetaFlags = ((mstrFlags *) MSTR_HDR(16, s)) - 1; + break; + default: /* MSTR_TYPE_64: */ + hdrlen = sizeof(struct mstrhdr64); + pMetaFlags = ((mstrFlags *) MSTR_HDR(64, s)) - 1; + break; + } + return (char *) s - hdrlen - ((isMeta) ? 
(mstrSumMetaLen(kind, *pMetaFlags) + sizeof(mstrFlags)) : 0); +} + +static inline int mstrHdrSize(char type) { + switch(type&MSTR_TYPE_MASK) { + case MSTR_TYPE_5: + return sizeof(struct mstrhdr5); + case MSTR_TYPE_8: + return sizeof(struct mstrhdr8); + case MSTR_TYPE_16: + return sizeof(struct mstrhdr16); + case MSTR_TYPE_64: + return sizeof(struct mstrhdr64); + } + return 0; +} + +static inline char mstrReqType(size_t string_size) { + if (string_size < 1<<5) + return MSTR_TYPE_5; + if (string_size < 1<<8) + return MSTR_TYPE_8; + if (string_size < 1<<16) + return MSTR_TYPE_16; + return MSTR_TYPE_64; +} + +#ifdef REDIS_TEST +#include +#include +#include "testhelp.h" +#include "limits.h" + +#ifndef UNUSED +#define UNUSED(x) (void)(x) +#endif + +/* Challenge mstr with metadata interesting enough that can include the case of hfield and hkey and more */ +#define B(idx) (1<<(idx)) + +#define META_IDX_MYMSTR_TTL4 0 +#define META_IDX_MYMSTR_TTL8 1 +#define META_IDX_MYMSTR_TYPE_ENC_LRU 2 // 4Bbit type, 4bit encoding, 24bits lru +#define META_IDX_MYMSTR_VALUE_PTR 3 +#define META_IDX_MYMSTR_FLAG_NO_META 4 + +#define TEST_CONTEXT(context) printf("\nContext: %s \n", context); + +int mstrTest(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + struct mstrKind kind_mymstr = { + .name = "my_mstr", + .metaSize[META_IDX_MYMSTR_TTL4] = 4, + .metaSize[META_IDX_MYMSTR_TTL8] = 8, + .metaSize[META_IDX_MYMSTR_TYPE_ENC_LRU] = 4, + .metaSize[META_IDX_MYMSTR_VALUE_PTR] = 8, + .metaSize[META_IDX_MYMSTR_FLAG_NO_META] = 0, + }; + + TEST_CONTEXT("Create simple short mstr") + { + char *str = "foo"; + mstr s = mstrNew(str, strlen(str), 0); + size_t expStrLen = strlen(str); + + test_cond("Verify str length and alloc length", + mstrAllocLen(s, NULL) == (1 + expStrLen + 1) && /* mstrhdr5 + str + null */ + mstrlen(s) == expStrLen && /* expected strlen(str) */ + memcmp(s, str, expStrLen + 1) == 0); + mstrFree(&kind_mymstr, s); + } + + TEST_CONTEXT("Create simple 40 
bytes mstr") + { + char *str = "0123456789012345678901234567890123456789"; // 40 bytes + mstr s = mstrNew(str, strlen(str), 0); + + test_cond("Verify str length and alloc length", + mstrAllocLen(s, NULL) == (3 + 40 + 1) && /* mstrhdr8 + str + null */ + mstrlen(s) == 40 && + memcmp(s,str,40) == 0); + mstrFree(&kind_mymstr, s); + } + + TEST_CONTEXT("Create mstr with random characters") + { + long unsigned int i; + char str[66000]; + for (i = 0 ; i < sizeof(str) ; ++i) str[i] = rand() % 256; + + size_t len[] = { 31, 32, 33, 255, 256, 257, 65535, 65536, 65537, 66000}; + for (i = 0 ; i < sizeof(len) / sizeof(len[0]) ; ++i) { + char title[100]; + mstr s = mstrNew(str, len[i], 0); + size_t mstrhdrSize = (len[i] < 1<<5) ? sizeof(struct mstrhdr5) : + (len[i] < 1<<8) ? sizeof(struct mstrhdr8) : + (len[i] < 1<<16) ? sizeof(struct mstrhdr16) : + sizeof(struct mstrhdr64); + + snprintf(title, sizeof(title), "Verify string of length %zu", len[i]); + test_cond(title, + mstrAllocLen(s, NULL) == (mstrhdrSize + len[i] + 1) && /* mstrhdrX + str + null */ + mstrlen(s) == len[i] && + memcmp(s,str,len[i]) == 0); + mstrFree(&kind_mymstr, s); + } + } + + TEST_CONTEXT("Create short mstr with TTL4") + { + uint32_t *ttl; + mstr s = mstrNewWithMeta(&kind_mymstr, + "foo", + strlen("foo"), + B(META_IDX_MYMSTR_TTL4), /* allocate with TTL4 metadata */ + 0); + + ttl = mstrMetaRef(s, &kind_mymstr, META_IDX_MYMSTR_TTL4); + *ttl = 0x12345678; + + test_cond("Verify memory-allocation and string lengths", + mstrAllocLen(s, &kind_mymstr) == (1 + 3 + 2 + 1 + 4) && /* mstrhdr5 + str + null + mstrFlags + TLL */ + mstrlen(s) == 3); + + unsigned char expMem[] = {0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x00, 0x1c, 'f', 'o', 'o', '\0' }; + uint32_t value = 0x12345678; + memcpy(expMem, &value, sizeof(uint32_t)); + test_cond("Verify string and TTL4 payload", memcmp( + mstrMetaRef(s, &kind_mymstr, 0) , expMem, sizeof(expMem)) == 0); + + test_cond("Verify mstrIsMetaAttached() function works", mstrIsMetaAttached(s) != 0); + + 
mstrFree(&kind_mymstr, s); + } + + TEST_CONTEXT("Create short mstr with TTL4 and value ptr ") + { + mstr s = mstrNewWithMeta(&kind_mymstr, "foo", strlen("foo"), + B(META_IDX_MYMSTR_TTL4) | B(META_IDX_MYMSTR_VALUE_PTR), 0); + *((uint32_t *) (mstrMetaRef(s, &kind_mymstr, + META_IDX_MYMSTR_TTL4))) = 0x12345678; + + test_cond("Verify length and alloc length", + mstrAllocLen(s, &kind_mymstr) == (1 + 3 + 1 + 2 + 4 + 8) && /* mstrhdr5 + str + null + mstrFlags + TLL + PTR */ + mstrlen(s) == 3); + mstrFree(&kind_mymstr, s); + } + + TEST_CONTEXT("Copy mstr and add it TTL4") + { + mstr s1 = mstrNew("foo", strlen("foo"), 0); + mstr s2 = mstrNewCopy(&kind_mymstr, s1, B(META_IDX_MYMSTR_TTL4)); + *((uint32_t *) (mstrMetaRef(s2, &kind_mymstr, META_IDX_MYMSTR_TTL4))) = 0x12345678; + + test_cond("Verify new mstr includes TTL4", + mstrAllocLen(s2, &kind_mymstr) == (1 + 3 + 1 + 2 + 4) && /* mstrhdr5 + str + null + mstrFlags + TTL4 */ + mstrlen(s2) == 3 && /* 'foo' = 3bytes */ + memcmp(s2, "foo\0", 4) == 0); + + mstr s3 = mstrNewCopy(&kind_mymstr, s2, B(META_IDX_MYMSTR_TTL4)); + unsigned char expMem[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0x1, 0x0, 0x1c, 'f', 'o', 'o', '\0' }; + uint32_t value = 0x12345678; + memcpy(expMem, &value, sizeof(uint32_t)); + + char *ppp = mstrGetStartAlloc(s3, &kind_mymstr); + test_cond("Verify string and TTL4 payload", + memcmp(ppp, expMem, sizeof(expMem)) == 0); + + mstrPrint(s3, &kind_mymstr, 1); + mstrFree(&kind_mymstr, s1); + mstrFree(&kind_mymstr, s2); + mstrFree(&kind_mymstr, s3); + } + + return 0; +} +#endif diff --git a/src/mstr.h b/src/mstr.h new file mode 100644 index 00000000000..1613a637ec6 --- /dev/null +++ b/src/mstr.h @@ -0,0 +1,226 @@ +/* + * Copyright Redis Ltd. 2024 - present + * + * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) + * or the Server Side Public License v1 (SSPLv1). + * + * + * WHAT IS MSTR (M-STRING)? + * ------------------------ + * mstr stands for immutable string with optional metadata attached. 
+ * + * sds string is widely used across the system and serves as a general purpose + * container to hold data. The need to optimize memory and aggregate strings + * along with metadata and store it into Redis data-structures as single bulk keep + * reoccur. One thought might be, why not to extend sds to support metadata. The + * answer is that sds is mutable string in its nature, with wide API (split, join, + * etc.). Pushing metadata logic into sds will make it very fragile, and complex + * to maintain. + * + * Another idea involved using a simple struct with flags and a dynamic buf[] at the + * end. While this could be viable, it introduces considerable complexity and would + * need maintenance across different contexts. + * + * As an alternative, we introduce a new implementation of immutable strings, + * with limited API, and with the option to attach metadata. The representation + * of the string, without any metadata, in its basic form, resembles SDS but + * without the API to manipulate the string. Only to attach metadata to it. The + * following diagram shows the memory layout of mstring (mstrhdr8) when no + * metadata is attached: + * + * +----------------------------------------------+ + * | mstrhdr8 | c-string | | + * +--------------------------------+-------------+ + * |8b |2b |1b |5b |?bytes |8b| + * | Len | Type |m-bit=0 | Unused | String |\0| + * +----------------------------------------------+ + * ^ + * | + * mstrNew() returns pointer to here --+ + * + * If metadata-flag is set, depicted in diagram above as m-bit in the diagram, + * then the header will be preceded with additional 16 bits of metadata flags such + * that if i'th bit is set, then the i'th metadata structure is attached to the + * mstring. The metadata layout and their sizes are defined by mstrKind structure + * (More below). 
+ * + * The following diagram shows the memory layout of mstr (mstrhdr8) when 3 bits in mFlags + * are set to indicate that 3 fields of metadata are attached to the mstring at the + * beginning. + * + * +-------------------------------------------------------------------------------+ + * | METADATA FIELDS | mflags | mstrhdr8 | c-string | | + * +-----------------------+--------+--------------------------------+-------------+ + * |?bytes |?bytes |?bytes |16b |8b |2b |1b |5b |?bytes |8b| + * | Meta3 | Meta2 | Meta0 | 0x1101 | Len | Type |m-bit=1 | Unused | String |\0| + * +-------------------------------------------------------------------------------+ + * ^ + * | + * mstrNewWithMeta() returns pointer to here --+ + * + * mstr allows to define different kinds (groups) of mstrings, each with its + * own unique metadata layout. For example, in case of hash-fields, all instances of + * it can optionally have TTL metadata attached to it. This is achieved by first + * prototyping a single mstrKind structure that defines the metadata layout and sizes + * of this specific kind. Now each hash-field instance has still the freedom to + * attach or not attach the metadata to it, and metadata flags (mFlags) of the + * instance will reflect this decision. + * + * In the future, the keys of Redis keyspace can be another kind of mstring that + * has TTL, LRU or even dictEntry metadata embedded into. Unlike vptr in c++, this + * struct won't be attached to mstring but will be passed as yet another argument + * to API, to save memory. In addition, each instance of a given mstrkind can hold + * any subset of metadata and the 8 bits of metadata-flags will reflect it. + * + * The following example shows how to define mstrKind for possible future keyspace + * that aggregates several keyspace related metadata into one compact, singly + * allocated, mstring. 
+ * + * typedef enum HkeyMetaFlags { + * HKEY_META_VAL_REF_COUNT = 0, // refcount + * HKEY_META_VAL_REF = 1, // Val referenced + * HKEY_META_EXPIRE = 2, // TTL and more + * HKEY_META_TYPE_ENC_LRU = 3, // TYPE + LRU + ENC + * HKEY_META_DICT_ENT_NEXT = 4, // Next dict entry + * // Following two must be together and in this order + * HKEY_META_VAL_EMBED8 = 5, // Val embedded, max 7 bytes + * HKEY_META_VAL_EMBED16 = 6, // Val embedded, max 15 bytes (23 with EMBED8) + * } HkeyMetaFlags; + * + * mstrKind hkeyKind = { + * .name = "hkey", + * .metaSize[HKEY_META_VAL_REF_COUNT] = 4, + * .metaSize[HKEY_META_VAL_REF] = 8, + * .metaSize[HKEY_META_EXPIRE] = sizeof(ExpireMeta), + * .metaSize[HKEY_META_TYPE_ENC_LRU] = 8, + * .metaSize[HKEY_META_DICT_ENT_NEXT] = 8, + * .metaSize[HKEY_META_VAL_EMBED8] = 8, + * .metaSize[HKEY_META_VAL_EMBED16] = 16, + * }; + * + * MSTR-ALIGNMENT + * -------------- + * There are two types of alignments to take into consideration: + * 1. Alignment of the metadata. + * 2. Alignment of returned mstr pointer + * + * 1) As the metadatas layout are reversed to their enumeration, it is recommended + * to put metadata with "better" alignment first in memory layout (enumerated + * last) and the worst, or those that simply don't require any alignment will be + * last in memory layout (enumerated first). This is similar the to the applied + * consideration when defining new struct in C. Note also that each metadata + * might either be attached to mstr or not which complicates the design phase + * of a new mstrKind a little. + * + * In the example above, HKEY_META_VAL_REF_COUNT, with worst alignment of 4 + * bytes, is enumerated first, and therefore, will be last in memory layout. + * + * 2) Few optimizations in Redis rely on the fact that sds address is always an odd + * pointer. We can achieve the same with a little effort. It was already taken + * care that all headers of type mstrhdrX has odd size. 
With that in mind, if + * a new kind of mstr is required to be limited to odd addresses, then we must + * make sure that sizes of all related metadatas that are defined in mstrKind + * are even in size. + */ + +#ifndef __MSTR_H +#define __MSTR_H + +#include +#include +#include + +/* Selective copy of ifndef from server.h instead of including it */ +#ifndef static_assert +#define static_assert(expr, lit) extern char __static_assert_failure[(expr) ? 1:-1] +#endif + +#define MSTR_TYPE_5 0 +#define MSTR_TYPE_8 1 +#define MSTR_TYPE_16 2 +#define MSTR_TYPE_64 3 +#define MSTR_TYPE_MASK 3 +#define MSTR_TYPE_BITS 2 + +#define MSTR_META_MASK 4 + +#define MSTR_HDR(T,s) ((struct mstrhdr##T *)((s)-(sizeof(struct mstrhdr##T)))) +#define MSTR_HDR_VAR(T,s) struct mstrhdr##T *sh = (void*)((s)-(sizeof(struct mstrhdr##T))); + +#define MSTR_META_BITS 1 /* is metadata attached? */ +#define MSTR_TYPE_5_LEN(f) ((f) >> (MSTR_TYPE_BITS + MSTR_META_BITS)) +#define CREATE_MSTR_INFO(len, ismeta, type) ( (((len< + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -303,6 +282,8 @@ void watchForKey(client *c, robj *key) { listNode *ln; watchedKey *wk; + if (listLength(c->watched_keys) == 0) server.watching_clients++; + /* Check if we are already watching for this key */ listRewind(c->watched_keys,&li); while((ln = listNext(&li))) { @@ -353,6 +334,7 @@ void unwatchAllKeys(client *c) { decrRefCount(wk->key); zfree(wk); } + server.watching_clients--; } /* Iterates over the watched_keys list and looks for an expired key. Keys which @@ -394,7 +376,7 @@ void touchWatchedKey(redisDb *db, robj *key) { /* The key was already expired when WATCH was called. */ if (db == wk->db && equalStringObjects(key, wk->key) && - dictFind(db->dict, key->ptr) == NULL) + dbFind(db, key->ptr) == NULL) { /* Already expired key is deleted, so logically no change. Clear * the flag. Deleted keys are not flagged as expired. 
*/ @@ -432,9 +414,9 @@ void touchAllWatchedKeysInDb(redisDb *emptied, redisDb *replaced_with) { dictIterator *di = dictGetSafeIterator(emptied->watched_keys); while((de = dictNext(di)) != NULL) { robj *key = dictGetKey(de); - int exists_in_emptied = dictFind(emptied->dict, key->ptr) != NULL; + int exists_in_emptied = dbFind(emptied, key->ptr) != NULL; if (exists_in_emptied || - (replaced_with && dictFind(replaced_with->dict, key->ptr))) + (replaced_with && dbFind(replaced_with, key->ptr) != NULL)) { list *clients = dictGetVal(de); if (!clients) continue; @@ -442,7 +424,7 @@ void touchAllWatchedKeysInDb(redisDb *emptied, redisDb *replaced_with) { while((ln = listNext(&li))) { watchedKey *wk = redis_member2struct(watchedKey, node, ln); if (wk->expired) { - if (!replaced_with || !dictFind(replaced_with->dict, key->ptr)) { + if (!replaced_with || !dbFind(replaced_with, key->ptr)) { /* Expired key now deleted. No logical change. Clear the * flag. Deleted keys are not flagged as expired. */ wk->expired = 0; diff --git a/src/networking.c b/src/networking.c index 56273fc7e52..be5fa06942b 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -32,6 +11,7 @@ #include "cluster.h" #include "script.h" #include "fpconv_dtoa.h" +#include "fmtargs.h" #include #include #include @@ -51,6 +31,14 @@ size_t sdsZmallocSize(sds s) { return zmalloc_size(sh); } +/* Return the size consumed from the allocator, for the specified hfield with + * metadata (mstr), including internal fragmentation. This function is used in + * order to compute the client output buffer size. */ +size_t hfieldZmallocSize(hfield s) { + void *sh = hfieldGetAllocPtr(s); + return zmalloc_size(sh); +} + /* Return the amount of memory used by the sds string at object->ptr * for a string object. This includes internal fragmentation. */ size_t getStringObjectSdsUsedMemory(robj *o) { @@ -413,8 +401,9 @@ void _addReplyToBufferOrList(client *c, const char *s, size_t len) { * to a channel which we are subscribed to, then we wanna postpone that message to be added * after the command's reply (specifically important during multi-exec). 
the exception is * the SUBSCRIBE command family, which (currently) have a push message instead of a proper reply. - * The check for executing_client also avoids affecting push messages that are part of eviction. */ - if (c == server.current_client && (c->flags & CLIENT_PUSHING) && + * The check for executing_client also avoids affecting push messages that are part of eviction. + * Check CLIENT_PUSHING first to avoid race conditions, as it's absent in module's fake client. */ + if ((c->flags & CLIENT_PUSHING) && c == server.current_client && server.executing_client && !cmdHasPushAsReply(server.executing_client->cmd)) { _addReplyProtoToList(c,server.pending_push_messages,s,len); @@ -1116,14 +1105,18 @@ void addReplyVerbatim(client *c, const char *s, size_t len, const char *ext) { } } -/* Add an array of C strings as status replies with a heading. - * This function is typically invoked by from commands that support - * subcommands in response to the 'help' subcommand. The help array - * is terminated by NULL sentinel. */ -void addReplyHelp(client *c, const char **help) { +/* This function is similar to the addReplyHelp function but adds the + * ability to pass in two arrays of strings. Some commands have + * some additional subcommands based on the specific feature implementation + * Redis is compiled with (currently just clustering). This function allows + * to pass is the common subcommands in `help` and any implementation + * specific subcommands in `extended_help`. 
+ */ +void addExtendedReplyHelp(client *c, const char **help, const char **extended_help) { sds cmd = sdsnew((char*) c->argv[0]->ptr); void *blenp = addReplyDeferredLen(c); int blen = 0; + int idx = 0; sdstoupper(cmd); addReplyStatusFormat(c, @@ -1131,6 +1124,10 @@ void addReplyHelp(client *c, const char **help) { sdsfree(cmd); while (help[blen]) addReplyStatus(c,help[blen++]); + if (extended_help) { + while (extended_help[idx]) addReplyStatus(c,extended_help[idx++]); + } + blen += idx; addReplyStatus(c,"HELP"); addReplyStatus(c," Print this help."); @@ -1140,6 +1137,14 @@ void addReplyHelp(client *c, const char **help) { setDeferredArrayLen(c,blenp,blen); } +/* Add an array of C strings as status replies with a heading. + * This function is typically invoked by commands that support + * subcommands in response to the 'help' subcommand. The help array + * is terminated by NULL sentinel. */ +void addReplyHelp(client *c, const char **help) { + addExtendedReplyHelp(c, help, NULL); +} + /* Add a suggestive error reply. * This function is typically invoked by from commands that support * subcommands in response to an unknown subcommand or argument error. */ @@ -1433,7 +1438,7 @@ void unlinkClient(client *c) { listNode *ln; /* If this is marked as current client unset it. */ - if (server.current_client == c) server.current_client = NULL; + if (c->conn && server.current_client == c) server.current_client = NULL; /* Certain operations must be done only if the client has an active connection. * If the client was already unlinked or if it's a "fake client" the @@ -1477,7 +1482,7 @@ void unlinkClient(client *c) { } /* Remove from the list of pending reads if needed. 
*/ - serverAssert(io_threads_op == IO_THREADS_OP_IDLE); + serverAssert(!c->conn || io_threads_op == IO_THREADS_OP_IDLE); if (c->pending_read_list_node != NULL) { listDelNode(server.clients_pending_read,c->pending_read_list_node); c->pending_read_list_node = NULL; @@ -1529,6 +1534,7 @@ void clearClientConnectionState(client *c) { pubsubUnsubscribeAllChannels(c,0); pubsubUnsubscribeShardAllChannels(c, 0); pubsubUnsubscribeAllPatterns(c,0); + unmarkClientAsPubSub(c); if (c->name) { decrRefCount(c->name); @@ -1539,10 +1545,22 @@ void clearClientConnectionState(client *c) { * represent the client library behind the connection. */ /* Selectively clear state flags not covered above */ - c->flags &= ~(CLIENT_ASKING|CLIENT_READONLY|CLIENT_PUBSUB|CLIENT_REPLY_OFF| + c->flags &= ~(CLIENT_ASKING|CLIENT_READONLY|CLIENT_REPLY_OFF| CLIENT_REPLY_SKIP_NEXT|CLIENT_NO_TOUCH|CLIENT_NO_EVICT); } +void deauthenticateAndCloseClient(client *c) { + c->user = DefaultUser; + c->authenticated = 0; + /* We will write replies to this client later, so we can't + * close it directly even if async. */ + if (c == server.current_client) { + c->flags |= CLIENT_CLOSE_AFTER_COMMAND; + } else { + freeClientAsync(c); + } +} + void freeClient(client *c) { listNode *ln; @@ -1614,6 +1632,7 @@ void freeClient(client *c) { pubsubUnsubscribeAllChannels(c,0); pubsubUnsubscribeShardAllChannels(c, 0); pubsubUnsubscribeAllPatterns(c,0); + unmarkClientAsPubSub(c); dictRelease(c->pubsub_channels); dictRelease(c->pubsub_patterns); dictRelease(c->pubsubshard_channels); @@ -1630,6 +1649,12 @@ void freeClient(client *c) { reqresReset(c, 1); #endif + /* Remove the contribution that this client gave to our + * incrementally computed memory usage. */ + if (c->conn) + server.stat_clients_type_memory[c->last_memory_type] -= + c->last_memory_usage; + /* Unlink the client: this will close the socket, remove the I/O * handlers, and remove references of the client from different * places where active clients may be referenced. 
*/ @@ -1678,10 +1703,6 @@ void freeClient(client *c) { * we lost the connection with the master. */ if (c->flags & CLIENT_MASTER) replicationHandleMasterDisconnection(); - /* Remove the contribution that this client gave to our - * incrementally computed memory usage. */ - server.stat_clients_type_memory[c->last_memory_type] -= - c->last_memory_usage; /* Remove client from memory usage buckets */ if (c->mem_usage_bucket) { c->mem_usage_bucket->mem_usage_sum -= c->last_memory_usage; @@ -1700,7 +1721,7 @@ void freeClient(client *c) { zfree(c); } -/* Schedule a client to free it at a safe time in the serverCron() function. +/* Schedule a client to free it at a safe time in the beforeSleep() function. * This function is useful when we need to terminate a client but we are in * a context where calling freeClient() is not possible, because the client * should be valid for the continuation of the flow of the program. */ @@ -1712,6 +1733,9 @@ void freeClientAsync(client *c) { * idle. */ if (c->flags & CLIENT_CLOSE_ASAP || c->flags & CLIENT_SCRIPT) return; c->flags |= CLIENT_CLOSE_ASAP; + /* Replicas that was marked as CLIENT_CLOSE_ASAP should not keep the + * replication backlog from been trimmed. */ + if (c->flags & CLIENT_SLAVE) freeReplicaReferencedReplBuffer(c); if (server.io_threads_num == 1) { /* no need to bother with locking if there's just one thread (the main thread) */ listAddNodeTail(server.clients_to_close,c); @@ -1793,8 +1817,9 @@ int freeClientsInAsyncFreeQueue(void) { * are not registered clients. */ client *lookupClientByID(uint64_t id) { id = htonu64(id); - client *c = raxFind(server.clients_index,(unsigned char*)&id,sizeof(id)); - return (c == raxNotFound) ? 
NULL : c; + void *c = NULL; + raxFind(server.clients_index,(unsigned char*)&id,sizeof(id),&c); + return c; } /* This function should be called from _writeToClient when the reply list is not empty, @@ -2467,7 +2492,7 @@ int processCommandAndResetClient(client *c) { commandProcessed(c); /* Update the client's memory to include output buffer growth following the * processed command. */ - updateClientMemUsageAndBucket(c); + if (c->conn) updateClientMemUsageAndBucket(c); } if (server.current_client == NULL) deadclient = 1; @@ -2697,7 +2722,13 @@ void readQueryFromClient(connection *conn) { atomicIncr(server.stat_net_input_bytes, nread); } - if (!(c->flags & CLIENT_MASTER) && sdslen(c->querybuf) > server.client_max_querybuf_len) { + if (!(c->flags & CLIENT_MASTER) && + /* The commands cached in the MULTI/EXEC queue have not been executed yet, + * so they are also considered a part of the query buffer in a broader sense. + * + * For unauthenticated clients, the query buffer cannot exceed 1MB at most. 
*/ + (c->mstate.argv_len_sums + sdslen(c->querybuf) > server.client_max_querybuf_len || + (c->mstate.argv_len_sums + sdslen(c->querybuf) > 1024*1024 && authRequired(c)))) { sds ci = catClientInfoString(sdsempty(),c), bytes = sdsempty(); bytes = sdscatrepr(bytes,c->querybuf,64); @@ -2705,6 +2736,7 @@ void readQueryFromClient(connection *conn) { sdsfree(ci); sdsfree(bytes); freeClientAsync(c); + atomicIncr(server.stat_client_qbuf_limit_disconnections, 1); goto done; } @@ -2814,39 +2846,38 @@ sds catClientInfoString(sds s, client *client) { used_blocks_of_repl_buf = last->id - cur->id + 1; } - sds ret = sdscatfmt(s, - "id=%U addr=%s laddr=%s %s name=%s age=%I idle=%I flags=%s db=%i sub=%i psub=%i ssub=%i multi=%i qbuf=%U qbuf-free=%U argv-mem=%U multi-mem=%U rbs=%U rbp=%U obl=%U oll=%U omem=%U tot-mem=%U events=%s cmd=%s user=%s redir=%I resp=%i lib-name=%s lib-ver=%s", - (unsigned long long) client->id, - getClientPeerId(client), - getClientSockname(client), - connGetInfo(client->conn, conninfo, sizeof(conninfo)), - client->name ? (char*)client->name->ptr : "", - (long long)(server.unixtime - client->ctime), - (long long)(server.unixtime - client->lastinteraction), - flags, - client->db->id, - (int) dictSize(client->pubsub_channels), - (int) dictSize(client->pubsub_patterns), - (int) dictSize(client->pubsubshard_channels), - (client->flags & CLIENT_MULTI) ? client->mstate.count : -1, - (unsigned long long) sdslen(client->querybuf), - (unsigned long long) sdsavail(client->querybuf), - (unsigned long long) client->argv_len_sum, - (unsigned long long) client->mstate.argv_len_sums, - (unsigned long long) client->buf_usable_size, - (unsigned long long) client->buf_peak, - (unsigned long long) client->bufpos, - (unsigned long long) listLength(client->reply) + used_blocks_of_repl_buf, - (unsigned long long) obufmem, /* should not include client->buf since we want to see 0 for static clients. */ - (unsigned long long) total_mem, - events, - client->lastcmd ? 
client->lastcmd->fullname : "NULL", - client->user ? client->user->name : "(superuser)", - (client->flags & CLIENT_TRACKING) ? (long long) client->client_tracking_redirection : -1, - client->resp, - client->lib_name ? (char*)client->lib_name->ptr : "", - client->lib_ver ? (char*)client->lib_ver->ptr : "" - ); + sds ret = sdscatfmt(s, FMTARGS( + "id=%U", (unsigned long long) client->id, + " addr=%s", getClientPeerId(client), + " laddr=%s", getClientSockname(client), + " %s", connGetInfo(client->conn, conninfo, sizeof(conninfo)), + " name=%s", client->name ? (char*)client->name->ptr : "", + " age=%I", (long long)(commandTimeSnapshot() / 1000 - client->ctime), + " idle=%I", (long long)(server.unixtime - client->lastinteraction), + " flags=%s", flags, + " db=%i", client->db->id, + " sub=%i", (int) dictSize(client->pubsub_channels), + " psub=%i", (int) dictSize(client->pubsub_patterns), + " ssub=%i", (int) dictSize(client->pubsubshard_channels), + " multi=%i", (client->flags & CLIENT_MULTI) ? client->mstate.count : -1, + " watch=%i", (int) listLength(client->watched_keys), + " qbuf=%U", (unsigned long long) sdslen(client->querybuf), + " qbuf-free=%U", (unsigned long long) sdsavail(client->querybuf), + " argv-mem=%U", (unsigned long long) client->argv_len_sum, + " multi-mem=%U", (unsigned long long) client->mstate.argv_len_sums, + " rbs=%U", (unsigned long long) client->buf_usable_size, + " rbp=%U", (unsigned long long) client->buf_peak, + " obl=%U", (unsigned long long) client->bufpos, + " oll=%U", (unsigned long long) listLength(client->reply) + used_blocks_of_repl_buf, + " omem=%U", (unsigned long long) obufmem, /* should not include client->buf since we want to see 0 for static clients. */ + " tot-mem=%U", (unsigned long long) total_mem, + " events=%s", events, + " cmd=%s", client->lastcmd ? client->lastcmd->fullname : "NULL", + " user=%s", client->user ? client->user->name : "(superuser)", + " redir=%I", (client->flags & CLIENT_TRACKING) ? 
(long long) client->client_tracking_redirection : -1, + " resp=%i", client->resp, + " lib-name=%s", client->lib_name ? (char*)client->lib_name->ptr : "", + " lib-ver=%s", client->lib_ver ? (char*)client->lib_ver->ptr : "")); return ret; } @@ -3014,6 +3045,10 @@ void clientCommand(client *c) { " Kill connections authenticated by .", " * SKIPME (YES|NO)", " Skip killing current connection (default: yes).", +" * ID ", +" Kill connections by client id.", +" * MAXAGE ", +" Kill connections older than the specified age.", "LIST [options ...]", " Return information about client connections. Options:", " * TYPE (NORMAL|MASTER|REPLICA|PUBSUB)", @@ -3125,6 +3160,7 @@ NULL user *user = NULL; int type = -1; uint64_t id = 0; + long long max_age = 0; int skipme = 1; int killed = 0, close_this_client = 0; @@ -3146,6 +3182,18 @@ NULL "client-id should be greater than 0") != C_OK) return; id = tmp; + } else if (!strcasecmp(c->argv[i]->ptr,"maxage") && moreargs) { + long long tmp; + + if (getLongLongFromObjectOrReply(c, c->argv[i+1], &tmp, + "maxage is not an integer or out of range") != C_OK) + return; + if (tmp <= 0) { + addReplyError(c, "maxage should be greater than 0"); + return; + } + + max_age = tmp; } else if (!strcasecmp(c->argv[i]->ptr,"type") && moreargs) { type = getClientTypeByName(c->argv[i+1]->ptr); if (type == -1) { @@ -3195,6 +3243,7 @@ NULL if (id != 0 && client->id != id) continue; if (user && client->user != user) continue; if (c == client && skipme) continue; + if (max_age != 0 && (long long)(commandTimeSnapshot() / 1000 - client->ctime) < max_age) continue; /* Kill it. */ if (c == client) { @@ -3723,7 +3772,9 @@ void replaceClientCommandVector(client *c, int argc, robj **argv) { * 1. Make sure there are no "holes" and all the arguments are set. * 2. If the original argument vector was longer than the one we * want to end with, it's up to the caller to set c->argc and - * free the no longer used objects on c->argv. 
*/ + * free the no longer used objects on c->argv. + * 3. To remove argument at i'th index, pass NULL as new value + */ void rewriteClientCommandArgument(client *c, int i, robj *newval) { robj *oldval; retainOriginalCommandVector(c); @@ -3741,9 +3792,18 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) { } oldval = c->argv[i]; if (oldval) c->argv_len_sum -= getStringObjectLen(oldval); - if (newval) c->argv_len_sum += getStringObjectLen(newval); - c->argv[i] = newval; - incrRefCount(newval); + + if (newval) { + c->argv[i] = newval; + incrRefCount(newval); + c->argv_len_sum += getStringObjectLen(newval); + } else { + /* move the remaining arguments one step left */ + for (int j = i+1; j < c->argc; j++) { + c->argv[j-1] = c->argv[j]; + } + c->argv[--c->argc] = NULL; + } if (oldval) decrRefCount(oldval); /* If this is the command name make sure to fix c->cmd. */ @@ -3808,7 +3868,7 @@ size_t getClientMemoryUsage(client *c, size_t *output_buffer_mem_usage) { * classes of clients. * * The function will return one of the following: - * CLIENT_TYPE_NORMAL -> Normal client + * CLIENT_TYPE_NORMAL -> Normal client, including MONITOR * CLIENT_TYPE_SLAVE -> Slave * CLIENT_TYPE_PUBSUB -> Client subscribed to Pub/Sub channels * CLIENT_TYPE_MASTER -> The client representing our replication master. @@ -3929,6 +3989,7 @@ int closeClientOnOutputBufferLimitReached(client *c, int async) { client); } sdsfree(client); + server.stat_client_outbuf_limit_disconnections++; return 1; } return 0; diff --git a/src/notify.c b/src/notify.c index 2881a48dba8..2377166995c 100644 --- a/src/notify.c +++ b/src/notify.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2013, Salvatore Sanfilippo + * Copyright (c) 2013-Present, Redis Ltd. * All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -101,7 +80,7 @@ sds keyspaceEventsFlagsToString(int flags) { * 'event' is a C string representing the event name. * 'key' is a Redis object representing the key name. * 'dbid' is the database ID where the key lives. 
*/ -void notifyKeyspaceEvent(int type, char *event, robj *key, int dbid) { +void notifyKeyspaceEvent(int type, const char *event, robj *key, int dbid) { sds chan; robj *chanobj, *eventobj; int len = -1; diff --git a/src/object.c b/src/object.c index 4b3526a02c6..2b42e7b3e63 100644 --- a/src/object.c +++ b/src/object.c @@ -1,31 +1,10 @@ /* Redis Object implementation. * - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
+ * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -232,8 +211,8 @@ robj *dupStringObject(const robj *o) { } } -robj *createQuicklistObject(void) { - quicklist *l = quicklistCreate(); +robj *createQuicklistObject(int fill, int compress) { + quicklist *l = quicklistNew(fill, compress); robj *o = createObject(OBJ_LIST,l); o->encoding = OBJ_ENCODING_QUICKLIST; return o; @@ -354,17 +333,7 @@ void freeZsetObject(robj *o) { } void freeHashObject(robj *o) { - switch (o->encoding) { - case OBJ_ENCODING_HT: - dictRelease((dict*) o->ptr); - break; - case OBJ_ENCODING_LISTPACK: - lpFree(o->ptr); - break; - default: - serverPanic("Unknown hash encoding type"); - break; - } + hashTypeFree(o); } void freeModuleObject(robj *o) { @@ -523,6 +492,9 @@ void dismissHashObject(robj *o, size_t size_hint) { dismissMemory(d->ht_table[1], DICTHT_SIZE(d->ht_size_exp[1])*sizeof(dictEntry*)); } else if (o->encoding == OBJ_ENCODING_LISTPACK) { dismissMemory(o->ptr, lpBytes((unsigned char*)o->ptr)); + } else if (o->encoding == OBJ_ENCODING_LISTPACK_EX) { + listpackEx *lpt = o->ptr; + dismissMemory(lpt->lp, lpBytes((unsigned char*)lpt->lp)); } else { serverPanic("Unknown hash encoding type"); } @@ -960,6 +932,7 @@ char *strEncoding(int encoding) { case OBJ_ENCODING_HT: return "hashtable"; case OBJ_ENCODING_QUICKLIST: return "quicklist"; case OBJ_ENCODING_LISTPACK: return "listpack"; + case OBJ_ENCODING_LISTPACK_EX: return "listpackex"; case OBJ_ENCODING_INTSET: return "intset"; case OBJ_ENCODING_SKIPLIST: return "skiplist"; case OBJ_ENCODING_EMBSTR: return "embstr"; @@ -1000,7 +973,6 @@ size_t streamRadixTreeMemoryUsage(rax *rax) { * are checked and averaged to estimate the total size. */ #define OBJ_COMPUTE_SIZE_DEF_SAMPLES 5 /* Default sample size. 
*/ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { - sds ele, ele2; dict *d; dictIterator *di; struct dictEntry *de; @@ -1035,9 +1007,9 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { if (o->encoding == OBJ_ENCODING_HT) { d = o->ptr; di = dictGetIterator(d); - asize = sizeof(*o)+sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d)); + asize = sizeof(*o)+sizeof(dict)+(sizeof(struct dictEntry*)*dictBuckets(d)); while((de = dictNext(di)) != NULL && samples < sample_size) { - ele = dictGetKey(de); + sds ele = dictGetKey(de); elesize += dictEntryMemUsage() + sdsZmallocSize(ele); samples++; } @@ -1058,7 +1030,7 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { zskiplist *zsl = ((zset*)o->ptr)->zsl; zskiplistNode *znode = zsl->header->level[0].forward; asize = sizeof(*o)+sizeof(zset)+sizeof(zskiplist)+sizeof(dict)+ - (sizeof(struct dictEntry*)*dictSlots(d))+ + (sizeof(struct dictEntry*)*dictBuckets(d))+ zmalloc_size(zsl->header); while(znode != NULL && samples < sample_size) { elesize += sdsZmallocSize(znode->ele); @@ -1073,14 +1045,17 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { } else if (o->type == OBJ_HASH) { if (o->encoding == OBJ_ENCODING_LISTPACK) { asize = sizeof(*o)+zmalloc_size(o->ptr); + } else if (o->encoding == OBJ_ENCODING_LISTPACK_EX) { + listpackEx *lpt = o->ptr; + asize = sizeof(*o) + zmalloc_size(lpt) + zmalloc_size(lpt->lp); } else if (o->encoding == OBJ_ENCODING_HT) { d = o->ptr; di = dictGetIterator(d); - asize = sizeof(*o)+sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d)); + asize = sizeof(*o)+sizeof(dict)+(sizeof(struct dictEntry*)*dictBuckets(d)); while((de = dictNext(di)) != NULL && samples < sample_size) { - ele = dictGetKey(de); - ele2 = dictGetVal(de); - elesize += sdsZmallocSize(ele) + sdsZmallocSize(ele2); + hfield ele = dictGetKey(de); + sds ele2 = dictGetVal(de); + elesize += hfieldZmallocSize(ele) + 
sdsZmallocSize(ele2); elesize += dictEntryMemUsage(); samples++; } @@ -1183,10 +1158,15 @@ struct redisMemOverhead *getMemoryOverheadData(void) { (float)server.cron_malloc_stats.process_rss / server.cron_malloc_stats.zmalloc_used; mh->total_frag_bytes = server.cron_malloc_stats.process_rss - server.cron_malloc_stats.zmalloc_used; - mh->allocator_frag = - (float)server.cron_malloc_stats.allocator_active / server.cron_malloc_stats.allocator_allocated; - mh->allocator_frag_bytes = - server.cron_malloc_stats.allocator_active - server.cron_malloc_stats.allocator_allocated; + /* Starting with redis 7.4, the lua memory is part of the total memory usage + * of redis, and that includes RSS and all other memory metrics. We only want + * to deduct it from active defrag. */ + size_t frag_smallbins_bytes = + server.cron_malloc_stats.allocator_frag_smallbins_bytes - server.cron_malloc_stats.lua_allocator_frag_smallbins_bytes; + size_t allocated = + server.cron_malloc_stats.allocator_allocated - server.cron_malloc_stats.lua_allocator_allocated; + mh->allocator_frag = (float)frag_smallbins_bytes / allocated + 1; + mh->allocator_frag_bytes = frag_smallbins_bytes; mh->allocator_rss = (float)server.cron_malloc_stats.allocator_resident / server.cron_malloc_stats.allocator_active; mh->allocator_rss_bytes = @@ -1246,29 +1226,31 @@ struct redisMemOverhead *getMemoryOverheadData(void) { for (j = 0; j < server.dbnum; j++) { redisDb *db = server.db+j; - long long keyscount = dictSize(db->dict); - if (keyscount==0) continue; + if (!kvstoreNumAllocatedDicts(db->keys)) continue; + + unsigned long long keyscount = kvstoreSize(db->keys); mh->total_keys += keyscount; mh->db = zrealloc(mh->db,sizeof(mh->db[0])*(mh->num_dbs+1)); mh->db[mh->num_dbs].dbid = j; - mem = dictMemUsage(db->dict) + - dictSize(db->dict) * sizeof(robj); + mem = kvstoreMemUsage(db->keys) + + keyscount * sizeof(robj); mh->db[mh->num_dbs].overhead_ht_main = mem; mem_total+=mem; - mem = dictMemUsage(db->expires); + mem = 
kvstoreMemUsage(db->expires); mh->db[mh->num_dbs].overhead_ht_expires = mem; mem_total+=mem; - /* Account for the slot to keys map in cluster mode */ - mem = dictSize(db->dict) * dictEntryMetadataSize(db->dict) + - dictMetadataSize(db->dict); - mh->db[mh->num_dbs].overhead_ht_slot_to_keys = mem; - mem_total+=mem; - mh->num_dbs++; + + mh->overhead_db_hashtable_lut += kvstoreOverheadHashtableLut(db->keys); + mh->overhead_db_hashtable_lut += kvstoreOverheadHashtableLut(db->expires); + mh->overhead_db_hashtable_rehashing += kvstoreOverheadHashtableRehashing(db->keys); + mh->overhead_db_hashtable_rehashing += kvstoreOverheadHashtableRehashing(db->expires); + mh->db_dict_rehashing_count += kvstoreDictRehashingCount(db->keys); + mh->db_dict_rehashing_count += kvstoreDictRehashingCount(db->expires); } mh->overhead_total = mem_total; @@ -1281,7 +1263,7 @@ struct redisMemOverhead *getMemoryOverheadData(void) { if (zmalloc_used > mh->startup_allocated) net_usage = zmalloc_used - mh->startup_allocated; mh->dataset_perc = (float)mh->dataset*100/net_usage; - mh->bytes_per_key = mh->total_keys ? (net_usage / mh->total_keys) : 0; + mh->bytes_per_key = mh->total_keys ? 
(mh->dataset / mh->total_keys) : 0; return mh; } @@ -1551,19 +1533,18 @@ NULL return; } } - if ((de = dictFind(c->db->dict,c->argv[2]->ptr)) == NULL) { + if ((de = dbFind(c->db, c->argv[2]->ptr)) == NULL) { addReplyNull(c); return; } size_t usage = objectComputeSize(c->argv[2],dictGetVal(de),samples,c->db->id); usage += sdsZmallocSize(dictGetKey(de)); usage += dictEntryMemUsage(); - usage += dictMetadataSize(c->db->dict); addReplyLongLong(c,usage); } else if (!strcasecmp(c->argv[1]->ptr,"stats") && c->argc == 2) { struct redisMemOverhead *mh = getMemoryOverheadData(); - addReplyMapLen(c,27+mh->num_dbs); + addReplyMapLen(c,31+mh->num_dbs); addReplyBulkCString(c,"peak.allocated"); addReplyLongLong(c,mh->peak_allocated); @@ -1599,22 +1580,27 @@ NULL char dbname[32]; snprintf(dbname,sizeof(dbname),"db.%zd",mh->db[j].dbid); addReplyBulkCString(c,dbname); - addReplyMapLen(c,3); + addReplyMapLen(c,2); addReplyBulkCString(c,"overhead.hashtable.main"); addReplyLongLong(c,mh->db[j].overhead_ht_main); addReplyBulkCString(c,"overhead.hashtable.expires"); addReplyLongLong(c,mh->db[j].overhead_ht_expires); - - addReplyBulkCString(c,"overhead.hashtable.slot-to-keys"); - addReplyLongLong(c,mh->db[j].overhead_ht_slot_to_keys); } + addReplyBulkCString(c,"overhead.db.hashtable.lut"); + addReplyLongLong(c, mh->overhead_db_hashtable_lut); + + addReplyBulkCString(c,"overhead.db.hashtable.rehashing"); + addReplyLongLong(c, mh->overhead_db_hashtable_rehashing); addReplyBulkCString(c,"overhead.total"); addReplyLongLong(c,mh->overhead_total); + addReplyBulkCString(c,"db.dict.rehashing.count"); + addReplyLongLong(c, mh->db_dict_rehashing_count); + addReplyBulkCString(c,"keys.count"); addReplyLongLong(c,mh->total_keys); @@ -1639,6 +1625,9 @@ NULL addReplyBulkCString(c,"allocator.resident"); addReplyLongLong(c,server.cron_malloc_stats.allocator_resident); + addReplyBulkCString(c,"allocator.muzzy"); + addReplyLongLong(c,server.cron_malloc_stats.allocator_muzzy); + 
addReplyBulkCString(c,"allocator-fragmentation.ratio"); addReplyDouble(c,mh->allocator_frag); diff --git a/src/pqsort.c b/src/pqsort.c index fab54e026a2..62527170573 100644 --- a/src/pqsort.c +++ b/src/pqsort.c @@ -1,7 +1,7 @@ /* The following is the NetBSD libc qsort implementation modified in order to * support partial sorting of ranges for Redis. * - * Copyright(C) 2009-2012 Salvatore Sanfilippo. All rights reserved. + * Copyright(C) 2009-current Redis Ltd. All rights reserved. * * The original copyright notice follows. */ diff --git a/src/pqsort.h b/src/pqsort.h index 824ab5c0969..621147424e7 100644 --- a/src/pqsort.h +++ b/src/pqsort.h @@ -1,32 +1,11 @@ /* The following is the NetBSD libc qsort implementation modified in order to * support partial sorting of ranges for Redis. * - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). * * See the pqsort.c file for the original copyright notice. */ diff --git a/src/pubsub.c b/src/pubsub.c index a13c5a61fbe..25099055f6b 100644 --- a/src/pubsub.c +++ b/src/pubsub.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -36,7 +15,7 @@ typedef struct pubsubtype { int shard; dict *(*clientPubSubChannels)(client*); int (*subscriptionCount)(client*); - dict **serverPubSubChannels; + kvstore **serverPubSubChannels; robj **subscribeMsg; robj **unsubscribeMsg; robj **messageBulk; @@ -67,7 +46,7 @@ dict* getClientPubSubShardChannels(client *c); * If a pattern is provided, the subset of channels is returned * matching the pattern. */ -void channelList(client *c, sds pat, dict* pubsub_channels); +void channelList(client *c, sds pat, kvstore *pubsub_channels); /* * Pub/Sub type for global channels. @@ -208,15 +187,14 @@ void addReplyPubsubPatUnsubscribed(client *c, robj *pattern) { /* Return the number of pubsub channels + patterns is handled. */ int serverPubsubSubscriptionCount(void) { - return dictSize(server.pubsub_channels) + dictSize(server.pubsub_patterns); + return kvstoreSize(server.pubsub_channels) + dictSize(server.pubsub_patterns); } /* Return the number of pubsub shard level channels is handled. */ int serverPubsubShardSubscriptionCount(void) { - return dictSize(server.pubsubshard_channels); + return kvstoreSize(server.pubsubshard_channels); } - /* Return the number of channels + patterns a client is subscribed to. 
*/ int clientSubscriptionsCount(client *c) { return dictSize(c->pubsub_channels) + dictSize(c->pubsub_patterns); @@ -241,27 +219,51 @@ int clientTotalPubSubSubscriptionCount(client *c) { return clientSubscriptionsCount(c) + clientShardSubscriptionsCount(c); } +void markClientAsPubSub(client *c) { + if (!(c->flags & CLIENT_PUBSUB)) { + c->flags |= CLIENT_PUBSUB; + server.pubsub_clients++; + } +} + +void unmarkClientAsPubSub(client *c) { + if (c->flags & CLIENT_PUBSUB) { + c->flags &= ~CLIENT_PUBSUB; + server.pubsub_clients--; + } +} + /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or * 0 if the client was already subscribed to that channel. */ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { - dictEntry *de; - list *clients = NULL; + dictEntry *de, *existing; + dict *clients = NULL; int retval = 0; + unsigned int slot = 0; /* Add the channel to the client -> channels hash table */ - if (dictAdd(type.clientPubSubChannels(c),channel,NULL) == DICT_OK) { + void *position = dictFindPositionForInsert(type.clientPubSubChannels(c),channel,NULL); + if (position) { /* Not yet subscribed to this channel */ retval = 1; - incrRefCount(channel); /* Add the client to the channel -> list of clients hash table */ - de = dictFind(*type.serverPubSubChannels, channel); - if (de == NULL) { - clients = listCreate(); - dictAdd(*type.serverPubSubChannels, channel, clients); - incrRefCount(channel); + if (server.cluster_enabled && type.shard) { + slot = getKeySlot(channel->ptr); + } + + de = kvstoreDictAddRaw(*type.serverPubSubChannels, slot, channel, &existing); + + if (existing) { + clients = dictGetVal(existing); + channel = dictGetKey(existing); } else { - clients = dictGetVal(de); + clients = dictCreate(&clientDictType); + kvstoreDictSetVal(*type.serverPubSubChannels, slot, de, clients); + incrRefCount(channel); } - listAddNodeTail(clients,c); + + serverAssert(dictAdd(clients, c, NULL) != DICT_ERR); + 
serverAssert(dictInsertAtPosition(type.clientPubSubChannels(c), channel, position)); + incrRefCount(channel); } /* Notify the client */ addReplyPubsubSubscribed(c,channel,type); @@ -272,9 +274,9 @@ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { * 0 if the client was not subscribed to the specified channel. */ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype type) { dictEntry *de; - list *clients; - listNode *ln; + dict *clients; int retval = 0; + int slot = 0; /* Remove the channel from the client -> channels hash table */ incrRefCount(channel); /* channel may be just a pointer to the same object @@ -282,22 +284,18 @@ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype ty if (dictDelete(type.clientPubSubChannels(c),channel) == DICT_OK) { retval = 1; /* Remove the client from the channel -> clients list hash table */ - de = dictFind(*type.serverPubSubChannels, channel); + if (server.cluster_enabled && type.shard) { + slot = getKeySlot(channel->ptr); + } + de = kvstoreDictFind(*type.serverPubSubChannels, slot, channel); serverAssertWithInfo(c,NULL,de != NULL); clients = dictGetVal(de); - ln = listSearchKey(clients,c); - serverAssertWithInfo(c,NULL,ln != NULL); - listDelNode(clients,ln); - if (listLength(clients) == 0) { - /* Free the list and associated hash entry at all if this was + serverAssertWithInfo(c, NULL, dictDelete(clients, c) == DICT_OK); + if (dictSize(clients) == 0) { + /* Free the dict and associated hash entry at all if this was * the latest client, so that it will be possible to abuse * Redis PUBSUB creating millions of channels. */ - dictDelete(*type.serverPubSubChannels, channel); - /* As this channel isn't subscribed by anyone, it's safe - * to remove the channel from the slot. 
*/ - if (server.cluster_enabled & type.shard) { - slotToChannelDel(channel->ptr); - } + kvstoreDictDelete(*type.serverPubSubChannels, slot, channel); } } /* Notify the client */ @@ -308,41 +306,40 @@ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype ty return retval; } -void pubsubShardUnsubscribeAllClients(robj *channel) { - int retval; - dictEntry *de = dictFind(server.pubsubshard_channels, channel); - serverAssertWithInfo(NULL,channel,de != NULL); - list *clients = dictGetVal(de); - if (listLength(clients) > 0) { +/* Unsubscribe all shard channels in a slot. */ +void pubsubShardUnsubscribeAllChannelsInSlot(unsigned int slot) { + if (!kvstoreDictSize(server.pubsubshard_channels, slot)) + return; + + kvstoreDictIterator *kvs_di = kvstoreGetDictSafeIterator(server.pubsubshard_channels, slot); + dictEntry *de; + while ((de = kvstoreDictIteratorNext(kvs_di)) != NULL) { + robj *channel = dictGetKey(de); + dict *clients = dictGetVal(de); /* For each client subscribed to the channel, unsubscribe it. */ - listIter li; - listNode *ln; - listRewind(clients, &li); - while ((ln = listNext(&li)) != NULL) { - client *c = listNodeValue(ln); - retval = dictDelete(c->pubsubshard_channels, channel); + dictIterator *iter = dictGetIterator(clients); + dictEntry *entry; + while ((entry = dictNext(iter)) != NULL) { + client *c = dictGetKey(entry); + int retval = dictDelete(c->pubsubshard_channels, channel); serverAssertWithInfo(c,channel,retval == DICT_OK); addReplyPubsubUnsubscribed(c, channel, pubSubShardType); /* If the client has no other pubsub subscription, * move out of pubsub mode. */ if (clientTotalPubSubSubscriptionCount(c) == 0) { - c->flags &= ~CLIENT_PUBSUB; + unmarkClientAsPubSub(c); } } + dictReleaseIterator(iter); + kvstoreDictDelete(server.pubsubshard_channels, slot, channel); } - /* Delete the channel from server pubsubshard channels hash table. 
*/ - retval = dictDelete(server.pubsubshard_channels, channel); - /* Delete the channel from slots_to_channel mapping. */ - slotToChannelDel(channel->ptr); - serverAssertWithInfo(NULL,channel,retval == DICT_OK); - decrRefCount(channel); /* it is finally safe to release it */ + kvstoreReleaseDictIterator(kvs_di); } - /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the client was already subscribed to that pattern. */ int pubsubSubscribePattern(client *c, robj *pattern) { dictEntry *de; - list *clients; + dict *clients; int retval = 0; if (dictAdd(c->pubsub_patterns, pattern, NULL) == DICT_OK) { @@ -351,13 +348,13 @@ int pubsubSubscribePattern(client *c, robj *pattern) { /* Add the client to the pattern -> list of clients hash table */ de = dictFind(server.pubsub_patterns,pattern); if (de == NULL) { - clients = listCreate(); + clients = dictCreate(&clientDictType); dictAdd(server.pubsub_patterns,pattern,clients); incrRefCount(pattern); } else { clients = dictGetVal(de); } - listAddNodeTail(clients,c); + serverAssert(dictAdd(clients, c, NULL) != DICT_ERR); } /* Notify the client */ addReplyPubsubPatSubscribed(c,pattern); @@ -368,8 +365,7 @@ int pubsubSubscribePattern(client *c, robj *pattern) { * 0 if the client was not subscribed to the specified channel. */ int pubsubUnsubscribePattern(client *c, robj *pattern, int notify) { dictEntry *de; - list *clients; - listNode *ln; + dict *clients; int retval = 0; incrRefCount(pattern); /* Protect the object. 
May be the same we remove */ @@ -379,11 +375,9 @@ int pubsubUnsubscribePattern(client *c, robj *pattern, int notify) { de = dictFind(server.pubsub_patterns,pattern); serverAssertWithInfo(c,NULL,de != NULL); clients = dictGetVal(de); - ln = listSearchKey(clients,c); - serverAssertWithInfo(c,NULL,ln != NULL); - listDelNode(clients,ln); - if (listLength(clients) == 0) { - /* Free the list and associated hash entry at all if this was + serverAssertWithInfo(c, NULL, dictDelete(clients, c) == DICT_OK); + if (dictSize(clients) == 0) { + /* Free the dict and associated hash entry at all if this was * the latest client. */ dictDelete(server.pubsub_patterns,pattern); } @@ -432,17 +426,6 @@ int pubsubUnsubscribeShardAllChannels(client *c, int notify) { return count; } -/* - * Unsubscribe a client from provided shard subscribed channel(s). - */ -void pubsubUnsubscribeShardChannels(robj **channels, unsigned int count) { - for (unsigned int j = 0; j < count; j++) { - /* Remove the channel from server and from the clients - * subscribed to it as well as notify them. */ - pubsubShardUnsubscribeAllClients(channels[j]); - } -} - /* Unsubscribe from all the patterns. Return the number of patterns the * client was subscribed from. 
*/ int pubsubUnsubscribeAllPatterns(client *c, int notify) { @@ -471,23 +454,24 @@ int pubsubPublishMessageInternal(robj *channel, robj *message, pubsubtype type) int receivers = 0; dictEntry *de; dictIterator *di; - listNode *ln; - listIter li; + unsigned int slot = 0; /* Send to clients listening for that channel */ - de = dictFind(*type.serverPubSubChannels, channel); + if (server.cluster_enabled && type.shard) { + slot = keyHashSlot(channel->ptr, sdslen(channel->ptr)); + } + de = kvstoreDictFind(*type.serverPubSubChannels, slot, channel); if (de) { - list *list = dictGetVal(de); - listNode *ln; - listIter li; - - listRewind(list,&li); - while ((ln = listNext(&li)) != NULL) { - client *c = ln->value; + dict *clients = dictGetVal(de); + dictEntry *entry; + dictIterator *iter = dictGetIterator(clients); + while ((entry = dictNext(iter)) != NULL) { + client *c = dictGetKey(entry); addReplyPubsubMessage(c,channel,message,*type.messageBulk); updateClientMemUsageAndBucket(c); receivers++; } + dictReleaseIterator(iter); } if (type.shard) { @@ -501,19 +485,21 @@ int pubsubPublishMessageInternal(robj *channel, robj *message, pubsubtype type) channel = getDecodedObject(channel); while((de = dictNext(di)) != NULL) { robj *pattern = dictGetKey(de); - list *clients = dictGetVal(de); + dict *clients = dictGetVal(de); if (!stringmatchlen((char*)pattern->ptr, sdslen(pattern->ptr), (char*)channel->ptr, sdslen(channel->ptr),0)) continue; - listRewind(clients,&li); - while ((ln = listNext(&li)) != NULL) { - client *c = listNodeValue(ln); + dictEntry *entry; + dictIterator *iter = dictGetIterator(clients); + while ((entry = dictNext(iter)) != NULL) { + client *c = dictGetKey(entry); addReplyPubsubPatMessage(c,pattern,channel,message); updateClientMemUsageAndBucket(c); receivers++; } + dictReleaseIterator(iter); } decrRefCount(channel); dictReleaseIterator(di); @@ -546,7 +532,7 @@ void subscribeCommand(client *c) { } for (j = 1; j < c->argc; j++) 
pubsubSubscribeChannel(c,c->argv[j],pubSubType); - c->flags |= CLIENT_PUBSUB; + markClientAsPubSub(c); } /* UNSUBSCRIBE [channel ...] */ @@ -559,7 +545,9 @@ void unsubscribeCommand(client *c) { for (j = 1; j < c->argc; j++) pubsubUnsubscribeChannel(c,c->argv[j],1,pubSubType); } - if (clientTotalPubSubSubscriptionCount(c) == 0) c->flags &= ~CLIENT_PUBSUB; + if (clientTotalPubSubSubscriptionCount(c) == 0) { + unmarkClientAsPubSub(c); + } } /* PSUBSCRIBE pattern [pattern ...] */ @@ -579,7 +567,7 @@ void psubscribeCommand(client *c) { for (j = 1; j < c->argc; j++) pubsubSubscribePattern(c,c->argv[j]); - c->flags |= CLIENT_PUBSUB; + markClientAsPubSub(c); } /* PUNSUBSCRIBE [pattern [pattern ...]] */ @@ -592,7 +580,9 @@ void punsubscribeCommand(client *c) { for (j = 1; j < c->argc; j++) pubsubUnsubscribePattern(c,c->argv[j],1); } - if (clientTotalPubSubSubscriptionCount(c) == 0) c->flags &= ~CLIENT_PUBSUB; + if (clientTotalPubSubSubscriptionCount(c) == 0) { + unmarkClientAsPubSub(c); + } } /* This function wraps pubsubPublishMessage and also propagates the message to cluster. @@ -647,10 +637,10 @@ NULL addReplyArrayLen(c,(c->argc-2)*2); for (j = 2; j < c->argc; j++) { - list *l = dictFetchValue(server.pubsub_channels,c->argv[j]); + dict *d = kvstoreDictFetchValue(server.pubsub_channels, 0, c->argv[j]); addReplyBulk(c,c->argv[j]); - addReplyLongLong(c,l ? listLength(l) : 0); + addReplyLongLong(c, d ? dictSize(d) : 0); } } else if (!strcasecmp(c->argv[1]->ptr,"numpat") && c->argc == 2) { /* PUBSUB NUMPAT */ @@ -664,38 +654,43 @@ NULL } else if (!strcasecmp(c->argv[1]->ptr,"shardnumsub") && c->argc >= 2) { /* PUBSUB SHARDNUMSUB [ShardChannel_1 ... 
ShardChannel_N] */ int j; - addReplyArrayLen(c, (c->argc-2)*2); for (j = 2; j < c->argc; j++) { - list *l = dictFetchValue(server.pubsubshard_channels, c->argv[j]); + unsigned int slot = calculateKeySlot(c->argv[j]->ptr); + dict *clients = kvstoreDictFetchValue(server.pubsubshard_channels, slot, c->argv[j]); addReplyBulk(c,c->argv[j]); - addReplyLongLong(c,l ? listLength(l) : 0); + addReplyLongLong(c, clients ? dictSize(clients) : 0); } } else { addReplySubcommandSyntaxError(c); } } -void channelList(client *c, sds pat, dict *pubsub_channels) { - dictIterator *di = dictGetIterator(pubsub_channels); - dictEntry *de; +void channelList(client *c, sds pat, kvstore *pubsub_channels) { long mblen = 0; void *replylen; + unsigned int slot_cnt = kvstoreNumDicts(pubsub_channels); replylen = addReplyDeferredLen(c); - while((de = dictNext(di)) != NULL) { - robj *cobj = dictGetKey(de); - sds channel = cobj->ptr; - - if (!pat || stringmatchlen(pat, sdslen(pat), - channel, sdslen(channel),0)) - { - addReplyBulk(c,cobj); - mblen++; + for (unsigned int i = 0; i < slot_cnt; i++) { + if (!kvstoreDictSize(pubsub_channels, i)) + continue; + kvstoreDictIterator *kvs_di = kvstoreGetDictIterator(pubsub_channels, i); + dictEntry *de; + while((de = kvstoreDictIteratorNext(kvs_di)) != NULL) { + robj *cobj = dictGetKey(de); + sds channel = cobj->ptr; + + if (!pat || stringmatchlen(pat, sdslen(pat), + channel, sdslen(channel),0)) + { + addReplyBulk(c,cobj); + mblen++; + } } + kvstoreReleaseDictIterator(kvs_di); } - dictReleaseIterator(di); setDeferredArrayLen(c,replylen,mblen); } @@ -717,20 +712,11 @@ void ssubscribeCommand(client *c) { } for (int j = 1; j < c->argc; j++) { - /* A channel is only considered to be added, if a - * subscriber exists for it. And if a subscriber - * already exists the slotToChannel doesn't needs - * to be incremented. 
*/ - if (server.cluster_enabled & - (dictFind(*pubSubShardType.serverPubSubChannels, c->argv[j]) == NULL)) { - slotToChannelAdd(c->argv[j]->ptr); - } pubsubSubscribeChannel(c, c->argv[j], pubSubShardType); } - c->flags |= CLIENT_PUBSUB; + markClientAsPubSub(c); } - /* SUNSUBSCRIBE [shardchannel [shardchannel ...]] */ void sunsubscribeCommand(client *c) { if (c->argc == 1) { @@ -740,7 +726,9 @@ void sunsubscribeCommand(client *c) { pubsubUnsubscribeChannel(c, c->argv[j], 1, pubSubShardType); } } - if (clientTotalPubSubSubscriptionCount(c) == 0) c->flags &= ~CLIENT_PUBSUB; + if (clientTotalPubSubSubscriptionCount(c) == 0) { + unmarkClientAsPubSub(c); + } } size_t pubsubMemOverhead(client *c) { @@ -752,3 +740,9 @@ size_t pubsubMemOverhead(client *c) { mem += dictMemUsage(c->pubsubshard_channels); return mem; } + +int pubsubTotalSubscriptions(void) { + return dictSize(server.pubsub_patterns) + + kvstoreSize(server.pubsub_channels) + + kvstoreSize(server.pubsubshard_channels); +} diff --git a/src/quicklist.c b/src/quicklist.c index 301a2166ee7..7fe3430fced 100644 --- a/src/quicklist.c +++ b/src/quicklist.c @@ -48,18 +48,14 @@ * just one byte, it still won't overflow the 16 bit count field. */ static const size_t optimization_level[] = {4096, 8192, 16384, 32768, 65536}; -/* packed_threshold is initialized to 1gb*/ -static size_t packed_threshold = (1 << 30); +/* This is for test suite development purposes only, 0 means disabled. 
*/ +static size_t packed_threshold = 0; -/* set threshold for PLAIN nodes, the real limit is 4gb */ -#define isLargeElement(size) ((size) >= packed_threshold) - -int quicklistisSetPackedThreshold(size_t sz) { +/* set threshold for PLAIN nodes for test suite, the real limit is based on `fill` */ +int quicklistSetPackedThreshold(size_t sz) { /* Don't allow threshold to be set above or even slightly below 4GB */ if (sz > (1ull<<32) - (1<<20)) { return 0; - } else if (sz == 0) { /* 0 means restore threshold */ - sz = (1 << 30); } packed_threshold = sz; return 1; @@ -104,6 +100,9 @@ quicklistBookmark *_quicklistBookmarkFindByName(quicklist *ql, const char *name) quicklistBookmark *_quicklistBookmarkFindByNode(quicklist *ql, quicklistNode *node); void _quicklistBookmarkDelete(quicklist *ql, quicklistBookmark *bm); +REDIS_STATIC quicklistNode *_quicklistSplitNode(quicklistNode *node, int offset, int after); +REDIS_STATIC quicklistNode *_quicklistMergeNodes(quicklist *quicklist, quicklistNode *center); + /* Simple way to give quicklistEntry structs default values with one call. */ #define initEntry(e) \ do { \ @@ -158,9 +157,9 @@ void quicklistSetFill(quicklist *quicklist, int fill) { quicklist->fill = fill; } -void quicklistSetOptions(quicklist *quicklist, int fill, int depth) { +void quicklistSetOptions(quicklist *quicklist, int fill, int compress) { quicklistSetFill(quicklist, fill); - quicklistSetCompressDepth(quicklist, depth); + quicklistSetCompressDepth(quicklist, compress); } /* Create a new quicklist with some default parameters. */ @@ -378,6 +377,15 @@ REDIS_STATIC void __quicklistCompress(const quicklist *quicklist, quicklistCompressNode(reverse); } +/* This macro is used to compress a node. + * + * If the 'recompress' flag of the node is true, we compress it directly without + * checking whether it is within the range of compress depth. 
+ * However, it's important to ensure that the 'recompress' flag of head and tail + * is always false, as we always assume that head and tail are not compressed. + * + * If the 'recompress' flag of the node is false, we check whether the node is + * within the range of compress depth before compressing it. */ #define quicklistCompress(_ql, _node) \ do { \ if ((_node)->recompress) \ @@ -450,6 +458,15 @@ REDIS_STATIC void _quicklistInsertNodeAfter(quicklist *quicklist, #define sizeMeetsSafetyLimit(sz) ((sz) <= SIZE_SAFETY_LIMIT) +/* Calculate the size limit of the quicklist node based on negative 'fill'. */ +static size_t quicklistNodeNegFillLimit(int fill) { + assert(fill < 0); + size_t offset = (-fill) - 1; + size_t max_level = sizeof(optimization_level) / sizeof(*optimization_level); + if (offset >= max_level) offset = max_level - 1; + return optimization_level[offset]; +} + /* Calculate the size limit or length limit of the quicklist node * based on 'fill', and is also used to limit list listpack. */ void quicklistNodeLimit(int fill, size_t *size, unsigned int *count) { @@ -460,10 +477,7 @@ void quicklistNodeLimit(int fill, size_t *size, unsigned int *count) { /* Ensure that one node have at least one entry */ *count = (fill == 0) ? 1 : fill; } else { - size_t offset = (-fill) - 1; - size_t max_level = sizeof(optimization_level) / sizeof(*optimization_level); - if (offset >= max_level) offset = max_level - 1; - *size = optimization_level[offset]; + *size = quicklistNodeNegFillLimit(fill); } } @@ -488,12 +502,23 @@ int quicklistNodeExceedsLimit(int fill, size_t new_sz, unsigned int new_count) { redis_unreachable(); } +/* Determines whether a given size qualifies as a large element based on a threshold + * determined by the 'fill'. If the size is considered large, it will be stored in + * a plain node. 
*/ +static int isLargeElement(size_t sz, int fill) { + if (unlikely(packed_threshold != 0)) return sz >= packed_threshold; + if (fill >= 0) + return !sizeMeetsSafetyLimit(sz); + else + return sz > quicklistNodeNegFillLimit(fill); +} + REDIS_STATIC int _quicklistNodeAllowInsert(const quicklistNode *node, const int fill, const size_t sz) { if (unlikely(!node)) return 0; - if (unlikely(QL_NODE_IS_PLAIN(node) || isLargeElement(sz))) + if (unlikely(QL_NODE_IS_PLAIN(node) || isLargeElement(sz, fill))) return 0; /* Estimate how many bytes will be added to the listpack by this one entry. @@ -529,19 +554,25 @@ REDIS_STATIC int _quicklistNodeAllowMerge(const quicklistNode *a, (node)->sz = lpBytes((node)->entry); \ } while (0) -static quicklistNode* __quicklistCreatePlainNode(void *value, size_t sz) { +static quicklistNode* __quicklistCreateNode(int container, void *value, size_t sz) { quicklistNode *new_node = quicklistCreateNode(); - new_node->entry = zmalloc(sz); - new_node->container = QUICKLIST_NODE_CONTAINER_PLAIN; - memcpy(new_node->entry, value, sz); + new_node->container = container; + if (container == QUICKLIST_NODE_CONTAINER_PLAIN) { + new_node->entry = zmalloc(sz); + memcpy(new_node->entry, value, sz); + } else { + new_node->entry = lpPrepend(lpNew(0), value, sz); + } new_node->sz = sz; new_node->count++; return new_node; } static void __quicklistInsertPlainNode(quicklist *quicklist, quicklistNode *old_node, - void *value, size_t sz, int after) { - __quicklistInsertNode(quicklist, old_node, __quicklistCreatePlainNode(value, sz), after); + void *value, size_t sz, int after) +{ + quicklistNode *new_node = __quicklistCreateNode(QUICKLIST_NODE_CONTAINER_PLAIN, value, sz); + __quicklistInsertNode(quicklist, old_node, new_node, after); quicklist->count++; } @@ -552,7 +583,7 @@ static void __quicklistInsertPlainNode(quicklist *quicklist, quicklistNode *old_ int quicklistPushHead(quicklist *quicklist, void *value, size_t sz) { quicklistNode *orig_head = quicklist->head; - 
if (unlikely(isLargeElement(sz))) { + if (unlikely(isLargeElement(sz, quicklist->fill))) { __quicklistInsertPlainNode(quicklist, quicklist->head, value, sz, 0); return 1; } @@ -579,7 +610,7 @@ int quicklistPushHead(quicklist *quicklist, void *value, size_t sz) { * Returns 1 if new tail created. */ int quicklistPushTail(quicklist *quicklist, void *value, size_t sz) { quicklistNode *orig_tail = quicklist->tail; - if (unlikely(isLargeElement(sz))) { + if (unlikely(isLargeElement(sz, quicklist->fill))) { __quicklistInsertPlainNode(quicklist, quicklist->tail, value, sz, 1); return 1; } @@ -741,14 +772,18 @@ void quicklistReplaceEntry(quicklistIter *iter, quicklistEntry *entry, void *data, size_t sz) { quicklist* quicklist = iter->quicklist; + quicklistNode *node = entry->node; + unsigned char *newentry; - if (likely(!QL_NODE_IS_PLAIN(entry->node) && !isLargeElement(sz))) { - entry->node->entry = lpReplace(entry->node->entry, &entry->zi, data, sz); + if (likely(!QL_NODE_IS_PLAIN(entry->node) && !isLargeElement(sz, quicklist->fill) && + (newentry = lpReplace(entry->node->entry, &entry->zi, data, sz)) != NULL)) + { + entry->node->entry = newentry; quicklistNodeUpdateSz(entry->node); /* quicklistNext() and quicklistGetIteratorEntryAtIdx() provide an uncompressed node */ quicklistCompress(quicklist, entry->node); } else if (QL_NODE_IS_PLAIN(entry->node)) { - if (isLargeElement(sz)) { + if (isLargeElement(sz, quicklist->fill)) { zfree(entry->node->entry); entry->node->entry = zmalloc(sz); entry->node->sz = sz; @@ -758,17 +793,37 @@ void quicklistReplaceEntry(quicklistIter *iter, quicklistEntry *entry, quicklistInsertAfter(iter, entry, data, sz); __quicklistDelNode(quicklist, entry->node); } - } else { - entry->node->dont_compress = 1; /* Prevent compression in quicklistInsertAfter() */ - quicklistInsertAfter(iter, entry, data, sz); + } else { /* The node is full or data is a large element */ + quicklistNode *split_node = NULL, *new_node; + node->dont_compress = 1; /* Prevent 
compression in __quicklistInsertNode() */ + + /* If the entry is not at the tail, split the node at the entry's offset. */ + if (entry->offset != node->count - 1 && entry->offset != -1) + split_node = _quicklistSplitNode(node, entry->offset, 1); + + /* Create a new node and insert it after the original node. + * If the original node was split, insert the split node after the new node. */ + new_node = __quicklistCreateNode(isLargeElement(sz, quicklist->fill) ? + QUICKLIST_NODE_CONTAINER_PLAIN : QUICKLIST_NODE_CONTAINER_PACKED, data, sz); + __quicklistInsertNode(quicklist, node, new_node, 1); + if (split_node) __quicklistInsertNode(quicklist, new_node, split_node, 1); + quicklist->count++; + + /* Delete the replaced element. */ if (entry->node->count == 1) { __quicklistDelNode(quicklist, entry->node); } else { unsigned char *p = lpSeek(entry->node->entry, -1); quicklistDelIndex(quicklist, entry->node, &p); entry->node->dont_compress = 0; /* Re-enable compression */ - quicklistCompress(quicklist, entry->node); - quicklistCompress(quicklist, entry->node->next); + new_node = _quicklistMergeNodes(quicklist, new_node); + /* We can't know if the current node and its sibling nodes are correctly compressed, + * and we don't know if they are within the range of compress depth, so we need to + * use quicklistCompress() for compression, which checks if node is within compress + * depth before compressing. */ + quicklistCompress(quicklist, new_node); + quicklistCompress(quicklist, new_node->prev); + if (new_node->next) quicklistCompress(quicklist, new_node->next); } } @@ -826,6 +881,8 @@ REDIS_STATIC quicklistNode *_quicklistListpackMerge(quicklist *quicklist, } keep->count = lpLength(keep->entry); quicklistNodeUpdateSz(keep); + keep->recompress = 0; /* Prevent 'keep' from being recompressed if + * it becomes head or tail after merging. 
*/ nokeep->count = 0; __quicklistDelNode(quicklist, nokeep); @@ -844,9 +901,10 @@ REDIS_STATIC quicklistNode *_quicklistListpackMerge(quicklist *quicklist, * - (center->next, center->next->next) * - (center->prev, center) * - (center, center->next) + * + * Returns the new 'center' after merging. */ -REDIS_STATIC void _quicklistMergeNodes(quicklist *quicklist, - quicklistNode *center) { +REDIS_STATIC quicklistNode *_quicklistMergeNodes(quicklist *quicklist, quicklistNode *center) { int fill = quicklist->fill; quicklistNode *prev, *prev_prev, *next, *next_next, *target; prev = prev_prev = next = next_next = target = NULL; @@ -886,8 +944,9 @@ REDIS_STATIC void _quicklistMergeNodes(quicklist *quicklist, /* Use result of center merge (or original) to merge with next node. */ if (_quicklistNodeAllowMerge(target, target->next, fill)) { - _quicklistListpackMerge(quicklist, target, target->next); + target = _quicklistListpackMerge(quicklist, target, target->next); } + return target; } /* Split 'node' into two parts, parameterized by 'offset' and 'after'. 
@@ -959,7 +1018,7 @@ REDIS_STATIC void _quicklistInsert(quicklistIter *iter, quicklistEntry *entry, if (!node) { /* we have no reference node, so let's create only node in the list */ D("No node given!"); - if (unlikely(isLargeElement(sz))) { + if (unlikely(isLargeElement(sz, quicklist->fill))) { __quicklistInsertPlainNode(quicklist, quicklist->tail, value, sz, after); return; } @@ -996,13 +1055,13 @@ REDIS_STATIC void _quicklistInsert(quicklistIter *iter, quicklistEntry *entry, } } - if (unlikely(isLargeElement(sz))) { + if (unlikely(isLargeElement(sz, quicklist->fill))) { if (QL_NODE_IS_PLAIN(node) || (at_tail && after) || (at_head && !after)) { __quicklistInsertPlainNode(quicklist, node, value, sz, after); } else { quicklistDecompressNodeForUse(node); new_node = _quicklistSplitNode(node, entry->offset, after); - quicklistNode *entry_node = __quicklistCreatePlainNode(value, sz); + quicklistNode *entry_node = __quicklistCreateNode(QUICKLIST_NODE_CONTAINER_PLAIN, value, sz); __quicklistInsertNode(quicklist, node, entry_node, after); __quicklistInsertNode(quicklist, entry_node, new_node, after); quicklist->count++; @@ -2061,20 +2120,23 @@ int quicklistTest(int argc, char *argv[], int flags) { } TEST("Comprassion Plain node") { - char buf[256]; - quicklistisSetPackedThreshold(1); - quicklist *ql = quicklistNew(-2, 1); + for (int f = 0; f < fill_count; f++) { + size_t large_limit = (fills[f] < 0) ? quicklistNodeNegFillLimit(fills[f]) + 1 : SIZE_SAFETY_LIMIT + 1; + + char buf[large_limit]; + quicklist *ql = quicklistNew(fills[f], 1); for (int i = 0; i < 500; i++) { /* Set to 256 to allow the node to be triggered to compress, * if it is less than 48(nocompress), the test will be successful. 
*/ snprintf(buf, sizeof(buf), "hello%d", i); - quicklistPushHead(ql, buf, 256); + quicklistPushHead(ql, buf, large_limit); } quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); quicklistEntry entry; int i = 0; while (quicklistNext(iter, &entry)) { + assert(QL_NODE_IS_PLAIN(entry.node)); snprintf(buf, sizeof(buf), "hello%d", i); if (strcmp((char *)entry.value, buf)) ERR("value [%s] didn't match [%s] at position %d", @@ -2084,42 +2146,57 @@ int quicklistTest(int argc, char *argv[], int flags) { ql_release_iterator(iter); quicklistRelease(ql); } + } - TEST("NEXT plain node") - { - packed_threshold = 3; - quicklist *ql = quicklistNew(-2, options[_i]); - char *strings[] = {"hello1", "hello2", "h3", "h4", "hello5"}; + TEST("NEXT plain node") { + for (int f = 0; f < fill_count; f++) { + size_t large_limit = (fills[f] < 0) ? quicklistNodeNegFillLimit(fills[f]) + 1 : SIZE_SAFETY_LIMIT + 1; + quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 5; ++i) - quicklistPushHead(ql, strings[i], strlen(strings[i])); + char buf[large_limit]; + memcpy(buf, "plain", 5); + quicklistPushHead(ql, buf, large_limit); + quicklistPushHead(ql, buf, large_limit); + quicklistPushHead(ql, "packed3", 7); + quicklistPushHead(ql, "packed4", 7); + quicklistPushHead(ql, buf, large_limit); quicklistEntry entry; quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); - int j = 0; while(quicklistNext(iter, &entry) != 0) { - assert(strncmp(strings[j], (char *)entry.value, strlen(strings[j])) == 0); - j++; + if (QL_NODE_IS_PLAIN(entry.node)) + assert(!memcmp(entry.value, "plain", 5)); + else + assert(!memcmp(entry.value, "packed", 6)); } ql_release_iterator(iter); quicklistRelease(ql); } + } TEST("rotate plain node ") { + for (int f = 0; f < fill_count; f++) { + size_t large_limit = (fills[f] < 0) ? 
quicklistNodeNegFillLimit(fills[f]) + 1 : SIZE_SAFETY_LIMIT + 1; + unsigned char *data = NULL; size_t sz; long long lv; int i =0; - packed_threshold = 5; - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushHead(ql, "hello1", 6); - quicklistPushHead(ql, "hello4", 6); - quicklistPushHead(ql, "hello3", 6); - quicklistPushHead(ql, "hello2", 6); + quicklist *ql = quicklistNew(fills[f], options[_i]); + char buf[large_limit]; + memcpy(buf, "hello1", 6); + quicklistPushHead(ql, buf, large_limit); + memcpy(buf, "hello4", 6); + quicklistPushHead(ql, buf, large_limit); + memcpy(buf, "hello3", 6); + quicklistPushHead(ql, buf, large_limit); + memcpy(buf, "hello2", 6); + quicklistPushHead(ql, buf, large_limit); quicklistRotate(ql); for(i = 1 ; i < 5; i++) { + assert(QL_NODE_IS_PLAIN(ql->tail)); quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); int temp_char = data[5]; zfree(data); @@ -2128,7 +2205,7 @@ int quicklistTest(int argc, char *argv[], int flags) { ql_verify(ql, 0, 0, 0, 0); quicklistRelease(ql); - packed_threshold = (1 << 30); + } } TEST("rotate one val once") { @@ -3224,7 +3301,7 @@ int quicklistTest(int argc, char *argv[], int flags) { memcpy(s, "helloworld", 10); memcpy(s + sz - 10, "1234567890", 10); - quicklistNode *node = __quicklistCreatePlainNode(s, sz); + quicklistNode *node = __quicklistCreateNode(QUICKLIST_NODE_CONTAINER_PLAIN, s, sz); /* Just to avoid triggering the assertion in __quicklistCompressNode(), * it disables the passing of quicklist head or tail node. */ diff --git a/src/quicklist.h b/src/quicklist.h index f17834b9943..c4b07e0c009 100644 --- a/src/quicklist.h +++ b/src/quicklist.h @@ -42,7 +42,8 @@ * container: 2 bits, PLAIN=1 (a single item as char array), PACKED=2 (listpack with multiple items). * recompress: 1 bit, bool, true if node is temporary decompressed for usage. * attempted_compress: 1 bit, boolean, used for verifying during testing. 
- * extra: 10 bits, free for future use; pads out the remainder of 32 bits */ + * dont_compress: 1 bit, boolean, used for preventing compression of entry. + * extra: 9 bits, free for future use; pads out the remainder of 32 bits */ typedef struct quicklistNode { struct quicklistNode *prev; struct quicklistNode *next; @@ -154,9 +155,9 @@ typedef struct quicklistEntry { /* Prototypes */ quicklist *quicklistCreate(void); quicklist *quicklistNew(int fill, int compress); -void quicklistSetCompressDepth(quicklist *quicklist, int depth); +void quicklistSetCompressDepth(quicklist *quicklist, int compress); void quicklistSetFill(quicklist *quicklist, int fill); -void quicklistSetOptions(quicklist *quicklist, int fill, int depth); +void quicklistSetOptions(quicklist *quicklist, int fill, int compress); void quicklistRelease(quicklist *quicklist); int quicklistPushHead(quicklist *quicklist, void *value, const size_t sz); int quicklistPushTail(quicklist *quicklist, void *value, const size_t sz); @@ -201,7 +202,7 @@ int quicklistBookmarkCreate(quicklist **ql_ref, const char *name, quicklistNode int quicklistBookmarkDelete(quicklist *ql, const char *name); quicklistNode *quicklistBookmarkFind(quicklist *ql, const char *name); void quicklistBookmarksClear(quicklist *ql); -int quicklistisSetPackedThreshold(size_t sz); +int quicklistSetPackedThreshold(size_t sz); #ifdef REDIS_TEST int quicklistTest(int argc, char *argv[], int flags); diff --git a/src/rand.c b/src/rand.c index e1e98e63b99..6256c3bc3c2 100644 --- a/src/rand.c +++ b/src/rand.c @@ -13,7 +13,7 @@ * * ---------------------------------------------------------------------------- * - * Copyright (c) 2010-2012, Salvatore Sanfilippo + * Copyright (c) 2010-current, Redis Ltd. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/rand.h b/src/rand.h index 9884915a97d..ccacbf28a45 100644 --- a/src/rand.h +++ b/src/rand.h @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #ifndef REDIS_RANDOM_H diff --git a/src/rax.c b/src/rax.c index 287f9855d5b..491e50aa05d 100644 --- a/src/rax.c +++ b/src/rax.c @@ -1,42 +1,19 @@ /* Rax -- A radix tree implementation. * - * Version 1.2 -- 7 February 2019 - * - * Copyright (c) 2017-2019, Salvatore Sanfilippo + * Copyright (c) 2017-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #include #include -#include #include #include #include #include "rax.h" +#include "redisassert.h" #ifndef RAX_MALLOC_INCLUDE #define RAX_MALLOC_INCLUDE "rax_malloc.h" @@ -44,11 +21,6 @@ #include RAX_MALLOC_INCLUDE -/* This is a special pointer that is guaranteed to never have the same value - * of a radix tree node. It's used in order to report "not found" error without - * requiring the function to have multiple return values. */ -void *raxNotFound = (void*)"rax-not-found-pointer"; - /* -------------------------------- Debugging ------------------------------ */ void raxDebugShowNode(const char *msg, raxNode *n); @@ -201,11 +173,16 @@ raxNode *raxNewNode(size_t children, int datafield) { /* Allocate a new rax and return its pointer. On out of memory the function * returns NULL. */ rax *raxNew(void) { - rax *rax = rax_malloc(sizeof(*rax)); + return raxNewWithMetadata(0); +} + +/* Allocate a new rax with metadata */ +rax *raxNewWithMetadata(int metaSize) { + rax *rax = rax_malloc(sizeof(*rax) + metaSize); if (rax == NULL) return NULL; rax->numele = 0; rax->numnodes = 1; - rax->head = raxNewNode(0,0); + rax->head = raxNewNode(0, 0); if (rax->head == NULL) { rax_free(rax); return NULL; @@ -912,18 +889,19 @@ int raxTryInsert(rax *rax, unsigned char *s, size_t len, void *data, void **old) return raxGenericInsert(rax,s,len,data,old,0); } -/* Find a key in the rax, returns raxNotFound special void pointer value - * if the item was not found, otherwise the value associated with the - * item is returned. */ -void *raxFind(rax *rax, unsigned char *s, size_t len) { +/* Find a key in the rax: return 1 if the item is found, 0 otherwise. + * If there is an item and 'value' is passed in a non-NULL pointer, + * the value associated with the item is set at that address. 
*/ +int raxFind(rax *rax, unsigned char *s, size_t len, void **value) { raxNode *h; debugf("### Lookup: %.*s\n", (int)len, s); int splitpos = 0; size_t i = raxLowWalk(rax,s,len,&h,NULL,&splitpos,NULL); if (i != len || (h->iscompr && splitpos != 0) || !h->iskey) - return raxNotFound; - return raxGetData(h); + return 0; + if (value != NULL) *value = raxGetData(h); + return 1; } /* Return the memory address where the 'parent' node stores the specified @@ -1237,6 +1215,25 @@ void raxRecursiveFree(rax *rax, raxNode *n, void (*free_callback)(void*)) { rax->numnodes--; } +/* Same as raxRecursiveFree() with context argument */ +void raxRecursiveFreeWithCtx(rax *rax, raxNode *n, + void (*free_callback)(void *item, void *ctx), void *ctx) { + debugnode("free traversing",n); + int numchildren = n->iscompr ? 1 : n->size; + raxNode **cp = raxNodeLastChildPtr(n); + while(numchildren--) { + raxNode *child; + memcpy(&child,cp,sizeof(child)); + raxRecursiveFreeWithCtx(rax,child,free_callback, ctx); + cp--; + } + debugnode("free depth-first",n); + if (free_callback && n->iskey && !n->isnull) + free_callback(raxGetData(n), ctx); + rax_free(n); + rax->numnodes--; +} + /* Free a whole radix tree, calling the specified callback in order to * free the auxiliary data. */ void raxFreeWithCallback(rax *rax, void (*free_callback)(void*)) { @@ -1245,6 +1242,15 @@ void raxFreeWithCallback(rax *rax, void (*free_callback)(void*)) { rax_free(rax); } +/* Free a whole radix tree, calling the specified callback in order to + * free the auxiliary data. */ +void raxFreeWithCbAndContext(rax *rax, + void (*free_callback)(void *item, void *ctx), void *ctx) { + raxRecursiveFreeWithCtx(rax,rax->head,free_callback,ctx); + assert(rax->numnodes == 0); + rax_free(rax); +} + /* Free a whole radix tree. 
*/ void raxFree(rax *rax) { raxFreeWithCallback(rax,NULL); diff --git a/src/rax.h b/src/rax.h index 6b1fd4188cc..74963acaddb 100644 --- a/src/rax.h +++ b/src/rax.h @@ -1,31 +1,10 @@ /* Rax -- A radix tree implementation. * - * Copyright (c) 2017-2018, Salvatore Sanfilippo + * Copyright (c) 2017-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #ifndef RAX_H @@ -134,6 +113,7 @@ typedef struct rax { raxNode *head; uint64_t numele; uint64_t numnodes; + void *metadata[]; } rax; /* Stack data structure used by raxLowWalk() in order to, optionally, return @@ -185,17 +165,18 @@ typedef struct raxIterator { raxNodeCallback node_cb; /* Optional node callback. Normally set to NULL. */ } raxIterator; -/* A special pointer returned for not found items. */ -extern void *raxNotFound; - /* Exported API. */ rax *raxNew(void); +rax *raxNewWithMetadata(int metaSize); int raxInsert(rax *rax, unsigned char *s, size_t len, void *data, void **old); int raxTryInsert(rax *rax, unsigned char *s, size_t len, void *data, void **old); int raxRemove(rax *rax, unsigned char *s, size_t len, void **old); -void *raxFind(rax *rax, unsigned char *s, size_t len); +int raxFind(rax *rax, unsigned char *s, size_t len, void **value); void raxFree(rax *rax); void raxFreeWithCallback(rax *rax, void (*free_callback)(void*)); +void raxFreeWithCbAndContext(rax *rax, + void (*free_callback)(void *item, void *ctx), + void *ctx); void raxStart(raxIterator *it, rax *rt); int raxSeek(raxIterator *it, const char *op, unsigned char *ele, size_t len); int raxNext(raxIterator *it); diff --git a/src/rax_malloc.h b/src/rax_malloc.h index 9295985c653..a45bc98db2b 100644 --- a/src/rax_malloc.h +++ b/src/rax_malloc.h @@ -1,31 +1,10 @@ /* Rax -- A radix tree implementation. * - * Copyright (c) 2017, Salvatore Sanfilippo + * Copyright (c) 2017-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ /* Allocator selection. diff --git a/src/rdb.c b/src/rdb.c index cfc92e815bc..c5c0b04f66b 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -137,7 +116,7 @@ time_t rdbLoadTime(rio *rdb) { return (time_t)t32; } -int rdbSaveMillisecondTime(rio *rdb, long long t) { +ssize_t rdbSaveMillisecondTime(rio *rdb, long long t) { int64_t t64 = (int64_t) t; memrev64ifbe(&t64); /* Store in little endian. */ return rdbWriteRaw(rdb,&t64,8); @@ -289,8 +268,9 @@ int rdbEncodeInteger(long long value, unsigned char *enc) { * The returned value changes according to the flags, see * rdbGenericLoadStringObject() for more info. 
*/ void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags, size_t *lenptr) { - int plain = flags & RDB_LOAD_PLAIN; - int sds = flags & RDB_LOAD_SDS; + int plainFlag = flags & RDB_LOAD_PLAIN; + int sdsFlag = flags & RDB_LOAD_SDS; + int hfldFlag = flags & (RDB_LOAD_HFLD|RDB_LOAD_HFLD_TTL); int encode = flags & RDB_LOAD_ENC; unsigned char enc[4]; long long val; @@ -316,11 +296,17 @@ void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags, size_t *lenptr) { rdbReportCorruptRDB("Unknown RDB integer encoding type %d",enctype); return NULL; /* Never reached. */ } - if (plain || sds) { + if (plainFlag || sdsFlag || hfldFlag) { char buf[LONG_STR_SIZE], *p; int len = ll2string(buf,sizeof(buf),val); if (lenptr) *lenptr = len; - p = plain ? zmalloc(len) : sdsnewlen(SDS_NOINIT,len); + if (plainFlag) { + p = zmalloc(len); + } else if (sdsFlag) { + p = sdsnewlen(SDS_NOINIT,len); + } else { /* hfldFlag */ + p = hfieldNew(NULL, len, (flags&RDB_LOAD_HFLD) ? 0 : 1); + } memcpy(p,buf,len); return p; } else if (encode) { @@ -389,8 +375,11 @@ ssize_t rdbSaveLzfStringObject(rio *rdb, unsigned char *s, size_t len) { * changes according to 'flags'. For more info check the * rdbGenericLoadStringObject() function. */ void *rdbLoadLzfStringObject(rio *rdb, int flags, size_t *lenptr) { - int plain = flags & RDB_LOAD_PLAIN; - int sds = flags & RDB_LOAD_SDS; + int plainFlag = flags & RDB_LOAD_PLAIN; + int sdsFlag = flags & RDB_LOAD_SDS; + int hfldFlag = flags & (RDB_LOAD_HFLD | RDB_LOAD_HFLD_TTL); + int robjFlag = (!(plainFlag || sdsFlag || hfldFlag)); /* not plain/sds/hfld */ + uint64_t len, clen; unsigned char *c = NULL; char *val = NULL; @@ -403,11 +392,14 @@ void *rdbLoadLzfStringObject(rio *rdb, int flags, size_t *lenptr) { } /* Allocate our target according to the uncompressed size. 
*/ - if (plain) { + if (plainFlag) { val = ztrymalloc(len); - } else { + } else if (sdsFlag || robjFlag) { val = sdstrynewlen(SDS_NOINIT,len); + } else { /* hfldFlag */ + val = hfieldTryNew(NULL, len, (flags&RDB_LOAD_HFLD) ? 0 : 1); } + if (!val) { serverLog(isRestoreContext()? LL_VERBOSE: LL_WARNING, "rdbLoadLzfStringObject failed allocating %llu bytes", (unsigned long long)len); goto err; @@ -423,17 +415,17 @@ void *rdbLoadLzfStringObject(rio *rdb, int flags, size_t *lenptr) { } zfree(c); - if (plain || sds) { - return val; - } else { - return createObject(OBJ_STRING,val); - } + return (robjFlag) ? createObject(OBJ_STRING,val) : (void *) val; + err: zfree(c); - if (plain) + if (plainFlag) { zfree(val); - else + } else if (sdsFlag || robjFlag) { sdsfree(val); + } else { /* hfldFlag*/ + hfieldFree(val); + } return NULL; } @@ -512,12 +504,18 @@ ssize_t rdbSaveStringObject(rio *rdb, robj *obj) { * RDB_LOAD_PLAIN: Return a plain string allocated with zmalloc() * instead of a Redis object with an sds in it. * RDB_LOAD_SDS: Return an SDS string instead of a Redis object. + * RDB_LOAD_HFLD: Return a hash field object (mstr) + * RDB_LOAD_HFLD_TTL: Return a hash field with TTL metadata reserved * * On I/O error NULL is returned. */ void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr) { - int plain = flags & RDB_LOAD_PLAIN; - int sds = flags & RDB_LOAD_SDS; + void *buf; + int plainFlag = flags & RDB_LOAD_PLAIN; + int sdsFlag = flags & RDB_LOAD_SDS; + int hfldFlag = flags & (RDB_LOAD_HFLD|RDB_LOAD_HFLD_TTL); + int robjFlag = (!(plainFlag || sdsFlag || hfldFlag)); /* not plain/sds/hfld */ + int isencoded; unsigned long long len; @@ -538,22 +536,8 @@ void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr) { } } - if (plain || sds) { - void *buf = plain ? ztrymalloc(len) : sdstrynewlen(SDS_NOINIT,len); - if (!buf) { - serverLog(isRestoreContext()? 
LL_VERBOSE: LL_WARNING, "rdbGenericLoadStringObject failed allocating %llu bytes", len); - return NULL; - } - if (lenptr) *lenptr = len; - if (len && rioRead(rdb,buf,len) == 0) { - if (plain) - zfree(buf); - else - sdsfree(buf); - return NULL; - } - return buf; - } else { + /* return robj */ + if (robjFlag) { robj *o = tryCreateStringObject(SDS_NOINIT,len); if (!o) { serverLog(isRestoreContext()? LL_VERBOSE: LL_WARNING, "rdbGenericLoadStringObject failed allocating %llu bytes", len); @@ -565,6 +549,32 @@ void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr) { } return o; } + + /* plain/sds/hfld */ + if (plainFlag) { + buf = ztrymalloc(len); + } else if (sdsFlag) { + buf = sdstrynewlen(SDS_NOINIT,len); + } else { /* hfldFlag */ + buf = hfieldTryNew(NULL, len, (flags&RDB_LOAD_HFLD) ? 0 : 1); + } + if (!buf) { + serverLog(isRestoreContext()? LL_VERBOSE: LL_WARNING, "rdbGenericLoadStringObject failed allocating %llu bytes", len); + return NULL; + } + + if (lenptr) *lenptr = len; + if (len && rioRead(rdb,buf,len) == 0) { + if (plainFlag) + zfree(buf); + else if (sdsFlag) { + sdsfree(buf); + } else { /* hfldFlag */ + hfieldFree(buf); + } + return NULL; + } + return buf; } robj *rdbLoadStringObject(rio *rdb) { @@ -583,7 +593,7 @@ robj *rdbLoadEncodedStringObject(rio *rdb) { * 254: + inf * 255: - inf */ -int rdbSaveDoubleValue(rio *rdb, double val) { +ssize_t rdbSaveDoubleValue(rio *rdb, double val) { unsigned char buf[128]; int len; @@ -686,9 +696,14 @@ int rdbSaveObjectType(rio *rdb, robj *o) { case OBJ_HASH: if (o->encoding == OBJ_ENCODING_LISTPACK) return rdbSaveType(rdb,RDB_TYPE_HASH_LISTPACK); - else if (o->encoding == OBJ_ENCODING_HT) - return rdbSaveType(rdb,RDB_TYPE_HASH); - else + else if (o->encoding == OBJ_ENCODING_LISTPACK_EX) + return rdbSaveType(rdb,RDB_TYPE_HASH_LISTPACK_EX); + else if (o->encoding == OBJ_ENCODING_HT) { + if (hashTypeGetMinExpire(o, 0) == EB_EXPIRE_TIME_INVALID) + return rdbSaveType(rdb,RDB_TYPE_HASH); + else + return 
rdbSaveType(rdb,RDB_TYPE_HASH_METADATA); + } else serverPanic("Unknown hash encoding"); case OBJ_STREAM: return rdbSaveType(rdb,RDB_TYPE_STREAM_LISTPACKS_3); @@ -929,32 +944,58 @@ ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) { } } else if (o->type == OBJ_HASH) { /* Save a hash value */ - if (o->encoding == OBJ_ENCODING_LISTPACK) { - size_t l = lpBytes((unsigned char*)o->ptr); + if ((o->encoding == OBJ_ENCODING_LISTPACK) || + (o->encoding == OBJ_ENCODING_LISTPACK_EX)) + { + unsigned char *lp_ptr = hashTypeListpackGetLp(o); + size_t l = lpBytes(lp_ptr); - if ((n = rdbSaveRawString(rdb,o->ptr,l)) == -1) return -1; + if ((n = rdbSaveRawString(rdb,lp_ptr,l)) == -1) return -1; nwritten += n; } else if (o->encoding == OBJ_ENCODING_HT) { dictIterator *di = dictGetIterator(o->ptr); dictEntry *de; - + /* Determine the hash layout to use based on the presence of at least + * one field with a valid TTL. If such a field exists, employ the + * RDB_TYPE_HASH_METADATA layout, including tuples of [ttl][field][value]. + * Otherwise, use the standard RDB_TYPE_HASH layout containing only + * the tuples [field][value]. 
*/ + int with_ttl = (hashTypeGetMinExpire(o, 0) != EB_EXPIRE_TIME_INVALID); + + /* save number of fields in hash */ if ((n = rdbSaveLen(rdb,dictSize((dict*)o->ptr))) == -1) { dictReleaseIterator(di); return -1; } nwritten += n; + /* save all hash fields */ while((de = dictNext(di)) != NULL) { - sds field = dictGetKey(de); + hfield field = dictGetKey(de); sds value = dictGetVal(de); + /* save the TTL */ + if (with_ttl) { + uint64_t ttl = hfieldGetExpireTime(field); + /* 0 is used to indicate no TTL is set for this field */ + if (ttl == EB_EXPIRE_TIME_INVALID) ttl = 0; + if ((n = rdbSaveLen(rdb, ttl)) == -1) { + dictReleaseIterator(di); + return -1; + } + nwritten += n; + } + + /* save the key */ if ((n = rdbSaveRawString(rdb,(unsigned char*)field, - sdslen(field))) == -1) + hfieldlen(field))) == -1) { dictReleaseIterator(di); return -1; } nwritten += n; + + /* save the value */ if ((n = rdbSaveRawString(rdb,(unsigned char*)value, sdslen(value))) == -1) { @@ -1298,17 +1339,16 @@ ssize_t rdbSaveFunctions(rio *rdb) { } ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter) { - dictIterator *di; dictEntry *de; ssize_t written = 0; ssize_t res; + kvstoreIterator *kvs_it = NULL; static long long info_updated_time = 0; char *pname = (rdbflags & RDBFLAGS_AOF_PREAMBLE) ? "AOF rewrite" : "RDB"; redisDb *db = server.db + dbid; - dict *d = db->dict; - if (dictSize(d) == 0) return 0; - di = dictGetSafeIterator(d); + unsigned long long int db_size = kvstoreSize(db->keys); + if (db_size == 0) return 0; /* Write the SELECT DB opcode */ if ((res = rdbSaveType(rdb,RDB_OPCODE_SELECTDB)) < 0) goto werr; @@ -1317,9 +1357,7 @@ ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter) { written += res; /* Write the RESIZE DB opcode. 
*/ - uint64_t db_size, expires_size; - db_size = dictSize(db->dict); - expires_size = dictSize(db->expires); + unsigned long long expires_size = kvstoreSize(db->expires); if ((res = rdbSaveType(rdb,RDB_OPCODE_RESIZEDB)) < 0) goto werr; written += res; if ((res = rdbSaveLen(rdb,db_size)) < 0) goto werr; @@ -1327,8 +1365,23 @@ ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter) { if ((res = rdbSaveLen(rdb,expires_size)) < 0) goto werr; written += res; + kvs_it = kvstoreIteratorInit(db->keys); + int last_slot = -1; /* Iterate this DB writing every entry */ - while((de = dictNext(di)) != NULL) { + while ((de = kvstoreIteratorNext(kvs_it)) != NULL) { + int curr_slot = kvstoreIteratorGetCurrentDictIndex(kvs_it); + /* Save slot info. */ + if (server.cluster_enabled && curr_slot != last_slot) { + if ((res = rdbSaveType(rdb, RDB_OPCODE_SLOT_INFO)) < 0) goto werr; + written += res; + if ((res = rdbSaveLen(rdb, curr_slot)) < 0) goto werr; + written += res; + if ((res = rdbSaveLen(rdb, kvstoreDictSize(db->keys, curr_slot))) < 0) goto werr; + written += res; + if ((res = rdbSaveLen(rdb, kvstoreDictSize(db->expires, curr_slot))) < 0) goto werr; + written += res; + last_slot = curr_slot; + } sds keystr = dictGetKey(de); robj key, *o = dictGetVal(de); long long expire; @@ -1356,12 +1409,11 @@ ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter) { } } } - - dictReleaseIterator(di); + kvstoreIteratorRelease(kvs_it); return written; werr: - dictReleaseIterator(di); + if (kvs_it) kvstoreIteratorRelease(kvs_it); return -1; } @@ -1413,7 +1465,8 @@ int rdbSaveRio(int req, rio *rdb, int *error, int rdbflags, rdbSaveInfo *rsi) { return C_ERR; } -/* This is just a wrapper to rdbSaveRio() that additionally adds a prefix +/* This helper function is only used for diskless replication. + * This is just a wrapper to rdbSaveRio() that additionally adds a prefix * and a suffix to the generated RDB dump. 
The prefix is: * * $EOF:<40 bytes unguessable hex string>\r\n @@ -1430,7 +1483,7 @@ int rdbSaveRioWithEOFMark(int req, rio *rdb, int *error, rdbSaveInfo *rsi) { if (rioWrite(rdb,"$EOF:",5) == 0) goto werr; if (rioWrite(rdb,eofmark,RDB_EOF_MARK_SIZE) == 0) goto werr; if (rioWrite(rdb,"\r\n",2) == 0) goto werr; - if (rdbSaveRio(req,rdb,error,RDBFLAGS_NONE,rsi) == C_ERR) goto werr; + if (rdbSaveRio(req,rdb,error,RDBFLAGS_REPLICATION,rsi) == C_ERR) goto werr; if (rioWrite(rdb,eofmark,RDB_EOF_MARK_SIZE) == 0) goto werr; stopSaving(1); return C_OK; @@ -1517,7 +1570,7 @@ int rdbSave(int req, char *filename, rdbSaveInfo *rsi, int rdbflags) { char tmpfile[256]; char cwd[MAXPATHLEN]; /* Current working dir path for error messages. */ - startSaving(RDBFLAGS_NONE); + startSaving(rdbflags); snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid()); if (rdbSaveInternal(req,tmpfile,rsi,rdbflags) != C_OK) { @@ -1762,19 +1815,20 @@ static int _listZiplistEntryConvertAndValidate(unsigned char *p, unsigned int he /* callback for to check the listpack doesn't have duplicate records */ static int _lpEntryValidation(unsigned char *p, unsigned int head_count, void *userdata) { struct { - int pairs; + int tuple_len; long count; dict *fields; + long long last_expireat; } *data = userdata; if (data->fields == NULL) { data->fields = dictCreate(&hashDictType); - dictExpand(data->fields, data->pairs ? head_count/2 : head_count); + dictExpand(data->fields, head_count/data->tuple_len); } /* If we're checking pairs, then even records are field names. Otherwise * we're checking all elements. Add to dict and check that's not a dup */ - if (!data->pairs || ((data->count) & 1) == 0) { + if (data->count % data->tuple_len == 0) { unsigned char *str; int64_t slen; unsigned char buf[LP_INTBUF_SIZE]; @@ -1788,6 +1842,19 @@ static int _lpEntryValidation(unsigned char *p, unsigned int head_count, void *u } } + /* Validate TTL field, only for listpackex. 
*/ + if (data->count % data->tuple_len == 2) { + long long expire_at; + /* Must be an integer. */ + if (!lpGetIntegerValue(p, &expire_at)) return 0; + /* Must be less than EB_EXPIRE_TIME_MAX. */ + if (expire_at < 0 || (unsigned long long)expire_at > EB_EXPIRE_TIME_MAX) return 0; + /* TTL fields are ordered. If the current field has TTL, the previous field must + * also have one, and the current TTL must be greater than the previous one. */ + if (expire_at != 0 && (data->last_expireat == 0 || expire_at < data->last_expireat)) return 0; + data->last_expireat = expire_at; + } + (data->count)++; return 1; } @@ -1795,23 +1862,25 @@ static int _lpEntryValidation(unsigned char *p, unsigned int head_count, void *u /* Validate the integrity of the listpack structure. * when `deep` is 0, only the integrity of the header is validated. * when `deep` is 1, we scan all the entries one by one. - * when `pairs` is 0, all elements need to be unique (it's a set) - * when `pairs` is 1, odd elements need to be unique (it's a key-value map) */ -int lpValidateIntegrityAndDups(unsigned char *lp, size_t size, int deep, int pairs) { + * tuple_len indicates what is a logical entry tuple size. + * Whether tuple is of size 1 (set), 2 (field-value) or 3 (field-value[-ttl]), + * first element in the tuple must be unique */ +int lpValidateIntegrityAndDups(unsigned char *lp, size_t size, int deep, int tuple_len) { if (!deep) return lpValidateIntegrity(lp, size, 0, NULL, NULL); /* Keep track of the field names to locate duplicate ones */ struct { - int pairs; + int tuple_len; long count; dict *fields; /* Initialisation at the first callback. */ - } data = {pairs, 0, NULL}; + long long last_expireat; /* Last field's expiry time to ensure order in TTL fields. */ + } data = {tuple_len, 0, NULL, -1}; int ret = lpValidateIntegrity(lp, size, 1, _lpEntryValidation, &data); - /* make sure we have an even number of records. 
*/ - if (pairs && data.count & 1) + /* the number of records should be a multiple of the tuple length */ + if (data.count % tuple_len != 0) ret = 0; if (data.fields) dictRelease(data.fields); @@ -1820,9 +1889,16 @@ int lpValidateIntegrityAndDups(unsigned char *lp, size_t size, int deep, int pai /* Load a Redis object of the specified type from the specified file. * On success a newly allocated object is returned, otherwise NULL. - * When the function returns NULL and if 'error' is not NULL, the - * integer pointed by 'error' is set to the type of error that occurred */ -robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { + * + * error - When the function returns NULL and if 'error' is not NULL, the + * integer pointed by 'error' is set to the type of error that occurred + * minExpiredField - If loading a hash with expiration on fields, then this value + * will be set to the minimum expire time found in the hash fields. If there are + * no fields with expiration or it is not a hash, then it will set be to + * EB_EXPIRE_TIME_INVALID. 
+ */ +robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) +{ robj *o = NULL, *ele, *dec; uint64_t len; unsigned int i; @@ -1850,9 +1926,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { if ((len = rdbLoadLen(rdb,NULL)) == RDB_LENERR) return NULL; if (len == 0) goto emptykey; - o = createQuicklistObject(); - quicklistSetOptions(o->ptr, server.list_max_listpack_size, - server.list_compress_depth); + o = createQuicklistObject(server.list_max_listpack_size, server.list_compress_depth); /* Load every single element of the list */ while(len--) { @@ -1867,7 +1941,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { decrRefCount(ele); } - listTypeTryConversion(o,LIST_CONV_AUTO,NULL,NULL); + listTypeTryConversion(o, LIST_CONV_AUTO, NULL, NULL); } else if (rdbtype == RDB_TYPE_SET) { /* Read Set value */ if ((len = rdbLoadLen(rdb,NULL)) == RDB_LENERR) return NULL; @@ -1880,7 +1954,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { o = createSetObject(); /* It's faster to expand the dict to the right size asap in order * to avoid rehashing */ - if (len > DICT_HT_INITIAL_SIZE && dictTryExpand(o->ptr,len) != DICT_OK) { + if (len > DICT_HT_INITIAL_SIZE && dictTryExpand(o->ptr, len) != DICT_OK) { rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); decrRefCount(o); return NULL; @@ -1907,7 +1981,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { /* Fetch integer value from element. 
*/ if (isSdsRepresentableAsLongLong(sdsele,&llval) == C_OK) { uint8_t success; - o->ptr = intsetAdd(o->ptr,llval,&success); + o->ptr = intsetAdd(o->ptr, llval, &success); if (!success) { rdbReportCorruptRDB("Duplicate set members detected"); decrRefCount(o); @@ -1957,7 +2031,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { /* This will also be called when the set was just converted * to a regular hash table encoded set. */ if (o->encoding == OBJ_ENCODING_HT) { - if (dictAdd((dict*)o->ptr,sdsele,NULL) != DICT_OK) { + if (dictAdd((dict*)o->ptr, sdsele, NULL) != DICT_OK) { rdbReportCorruptRDB("Duplicate set members detected"); decrRefCount(o); sdsfree(sdsele); @@ -2035,12 +2109,13 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { maxelelen <= server.zset_max_listpack_value && lpSafeToAdd(NULL, totelelen)) { - zsetConvert(o,OBJ_ENCODING_LISTPACK); + zsetConvert(o, OBJ_ENCODING_LISTPACK); } } else if (rdbtype == RDB_TYPE_HASH) { uint64_t len; int ret; - sds field, value; + sds value; + hfield field; dict *dupSearchDict = NULL; len = rdbLoadLen(rdb, NULL); @@ -2051,7 +2126,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { /* Too many entries? Use a hash table right from the start. */ if (len > server.hash_max_listpack_entries) - hashTypeConvert(o, OBJ_ENCODING_HT); + hashTypeConvert(o, OBJ_ENCODING_HT, NULL); else if (deep_integrity_validation) { /* In this mode, we need to guarantee that the server won't crash * later when the ziplist is converted to a dict. 
@@ -2060,48 +2135,50 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { dupSearchDict = dictCreate(&hashDictType); } - - /* Load every field and value into the ziplist */ + /* Load every field and value into the listpack */ while (o->encoding == OBJ_ENCODING_LISTPACK && len > 0) { len--; /* Load raw strings */ - if ((field = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) { + if ((field = rdbGenericLoadStringObject(rdb,RDB_LOAD_HFLD,NULL)) == NULL) { decrRefCount(o); if (dupSearchDict) dictRelease(dupSearchDict); return NULL; } if ((value = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) { - sdsfree(field); + hfieldFree(field); decrRefCount(o); if (dupSearchDict) dictRelease(dupSearchDict); return NULL; } if (dupSearchDict) { - sds field_dup = sdsdup(field); + sds field_dup = sdsnewlen(field, hfieldlen(field)); + if (dictAdd(dupSearchDict, field_dup, NULL) != DICT_OK) { rdbReportCorruptRDB("Hash with dup elements"); dictRelease(dupSearchDict); decrRefCount(o); sdsfree(field_dup); - sdsfree(field); + hfieldFree(field); sdsfree(value); return NULL; } } /* Convert to hash table if size threshold is exceeded */ - if (sdslen(field) > server.hash_max_listpack_value || + if (hfieldlen(field) > server.hash_max_listpack_value || sdslen(value) > server.hash_max_listpack_value || - !lpSafeToAdd(o->ptr, sdslen(field)+sdslen(value))) + !lpSafeToAdd(o->ptr, hfieldlen(field) + sdslen(value))) { - hashTypeConvert(o, OBJ_ENCODING_HT); + hashTypeConvert(o, OBJ_ENCODING_HT, NULL); + dictUseStoredKeyApi((dict *)o->ptr, 1); ret = dictAdd((dict*)o->ptr, field, value); + dictUseStoredKeyApi((dict *)o->ptr, 0); if (ret == DICT_ERR) { rdbReportCorruptRDB("Duplicate hash fields detected"); if (dupSearchDict) dictRelease(dupSearchDict); sdsfree(value); - sdsfree(field); + hfieldFree(field); decrRefCount(o); return NULL; } @@ -2109,10 +2186,10 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { } /* Add pair to 
listpack */ - o->ptr = lpAppend(o->ptr, (unsigned char*)field, sdslen(field)); + o->ptr = lpAppend(o->ptr, (unsigned char*)field, hfieldlen(field)); o->ptr = lpAppend(o->ptr, (unsigned char*)value, sdslen(value)); - sdsfree(field); + hfieldFree(field); sdsfree(value); } @@ -2124,7 +2201,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { } if (o->encoding == OBJ_ENCODING_HT && len > DICT_HT_INITIAL_SIZE) { - if (dictTryExpand(o->ptr,len) != DICT_OK) { + if (dictTryExpand(o->ptr, len) != DICT_OK) { rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); decrRefCount(o); return NULL; @@ -2135,22 +2212,25 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { while (o->encoding == OBJ_ENCODING_HT && len > 0) { len--; /* Load encoded strings */ - if ((field = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) { + if ((field = rdbGenericLoadStringObject(rdb,RDB_LOAD_HFLD,NULL)) == NULL) { decrRefCount(o); return NULL; } if ((value = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) { - sdsfree(field); + hfieldFree(field); decrRefCount(o); return NULL; } /* Add pair to hash table */ - ret = dictAdd((dict*)o->ptr, field, value); + dict *d = o->ptr; + dictUseStoredKeyApi(d, 1); + ret = dictAdd(d, field, value); + dictUseStoredKeyApi(d, 0); if (ret == DICT_ERR) { rdbReportCorruptRDB("Duplicate hash fields detected"); sdsfree(value); - sdsfree(field); + hfieldFree(field); decrRefCount(o); return NULL; } @@ -2158,13 +2238,146 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { /* All pairs should be read by now */ serverAssert(len == 0); + } else if (rdbtype == RDB_TYPE_HASH_METADATA) { + sds value; + hfield field; + uint64_t expireAt; + dict *dupSearchDict = NULL; + + len = rdbLoadLen(rdb, NULL); + if (len == RDB_LENERR) return NULL; + if (len == 0) goto emptykey; + /* TODO: create listpackEx or HT directly*/ + o = createHashObject(); + /* Too many entries? 
Use a hash table right from the start. */ + if (len > server.hash_max_listpack_entries) { + hashTypeConvert(o, OBJ_ENCODING_HT, NULL); + dictTypeAddMeta((dict**)&o->ptr, &mstrHashDictTypeWithHFE); + initDictExpireMetadata(key, o); + } else { + hashTypeConvert(o, OBJ_ENCODING_LISTPACK_EX, NULL); + if (deep_integrity_validation) { + /* In this mode, we need to guarantee that the server won't crash + * later when the listpack is converted to a dict. + * Create a set (dict with no values) for dup search. + * We can dismiss it as soon as we convert the listpack to a hash. */ + dupSearchDict = dictCreate(&hashDictType); + } + } + + while (len > 0) { + len--; + + /* read the TTL */ + if (rdbLoadLenByRef(rdb, NULL, &expireAt) == -1) { + serverLog(LL_WARNING, "failed reading hash TTL"); + decrRefCount(o); + if (dupSearchDict != NULL) dictRelease(dupSearchDict); + return NULL; + } + if (expireAt > EB_EXPIRE_TIME_MAX) { + rdbReportCorruptRDB("invalid expireAt time: %llu", (unsigned long long)expireAt); + decrRefCount(o); + return NULL; + } + + /* if needed create field with TTL metadata */ + if (expireAt !=0) + field = rdbGenericLoadStringObject(rdb, RDB_LOAD_HFLD_TTL, NULL); + else + field = rdbGenericLoadStringObject(rdb, RDB_LOAD_HFLD, NULL); + + if (field == NULL) { + serverLog(LL_WARNING, "failed reading hash field"); + decrRefCount(o); + if (dupSearchDict != NULL) dictRelease(dupSearchDict); + return NULL; + } + + /* read the value */ + if ((value = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) { + serverLog(LL_WARNING, "failed reading hash value"); + decrRefCount(o); + if (dupSearchDict != NULL) dictRelease(dupSearchDict); + hfieldFree(field); + return NULL; + } + + /* store the values read - either to listpack or dict */ + if (o->encoding == OBJ_ENCODING_LISTPACK_EX) { + /* integrity - check for key duplication (if required) */ + if (dupSearchDict) { + sds field_dup = sdsnewlen(field, hfieldlen(field)); + + if (dictAdd(dupSearchDict, field_dup, NULL) != 
DICT_OK) { + rdbReportCorruptRDB("Hash with dup elements"); + dictRelease(dupSearchDict); + decrRefCount(o); + sdsfree(field_dup); + sdsfree(value); + hfieldFree(field); + return NULL; + } + } + + /* check if the values can be saved to listpack (or should convert to dict encoding) */ + if (hfieldlen(field) > server.hash_max_listpack_value || + sdslen(value) > server.hash_max_listpack_value || + !lpSafeToAdd(((listpackEx*)o->ptr)->lp, hfieldlen(field) + sdslen(value) + lpEntrySizeInteger(expireAt))) + { + /* convert to hash */ + hashTypeConvert(o, OBJ_ENCODING_HT, NULL); + + if (len > DICT_HT_INITIAL_SIZE) { /* TODO: this is NOT the original len, but this is also the case for simple hash, is this a bug? */ + if (dictTryExpand(o->ptr, len) != DICT_OK) { + rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); + decrRefCount(o); + if (dupSearchDict != NULL) dictRelease(dupSearchDict); + sdsfree(value); + hfieldFree(field); + return NULL; + } + } + + /* don't add the values to the new hash: the next if will catch and the values will be added there */ + } else { + listpackExAddNew(o, field, hfieldlen(field), + value, sdslen(value), expireAt); + hfieldFree(field); + sdsfree(value); + } + } + + if (o->encoding == OBJ_ENCODING_HT) { + /* Add pair to hash table */ + dict *d = o->ptr; + dictUseStoredKeyApi(d, 1); + int ret = dictAdd(d, field, value); + dictUseStoredKeyApi(d, 0); + + /* Attach expiry to the hash field and register in hash private HFE DS */ + if ((ret != DICT_ERR) && expireAt) { + dictExpireMetadata *m = (dictExpireMetadata *) dictMetadata(d); + ret = ebAdd(&m->hfe, &hashFieldExpireBucketsType, field, expireAt); + } + + if (ret == DICT_ERR) { + rdbReportCorruptRDB("Duplicate hash fields detected"); + sdsfree(value); + hfieldFree(field); + decrRefCount(o); + return NULL; + } + } + } + + if (dupSearchDict != NULL) dictRelease(dupSearchDict); + } else if (rdbtype == RDB_TYPE_LIST_QUICKLIST || rdbtype == RDB_TYPE_LIST_QUICKLIST_2) { if ((len = 
rdbLoadLen(rdb,NULL)) == RDB_LENERR) return NULL; if (len == 0) goto emptykey; - o = createQuicklistObject(); - quicklistSetOptions(o->ptr, server.list_max_listpack_size, - server.list_compress_depth); + o = createQuicklistObject(server.list_max_listpack_size, server.list_compress_depth); uint64_t container = QUICKLIST_NODE_CONTAINER_PACKED; while (len--) { unsigned char *lp; @@ -2234,7 +2447,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { goto emptykey; } - listTypeTryConversion(o,LIST_CONV_AUTO,NULL,NULL); + listTypeTryConversion(o, LIST_CONV_AUTO, NULL, NULL); } else if (rdbtype == RDB_TYPE_HASH_ZIPMAP || rdbtype == RDB_TYPE_LIST_ZIPLIST || rdbtype == RDB_TYPE_SET_INTSET || @@ -2242,14 +2455,15 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { rdbtype == RDB_TYPE_ZSET_ZIPLIST || rdbtype == RDB_TYPE_ZSET_LISTPACK || rdbtype == RDB_TYPE_HASH_ZIPLIST || - rdbtype == RDB_TYPE_HASH_LISTPACK) + rdbtype == RDB_TYPE_HASH_LISTPACK || + rdbtype == RDB_TYPE_HASH_LISTPACK_EX) { size_t encoded_len; unsigned char *encoded = rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN,&encoded_len); if (encoded == NULL) return NULL; - o = createObject(OBJ_STRING,encoded); /* Obj type fixed below. */ + o = createObject(OBJ_STRING, encoded); /* Obj type fixed below. 
*/ /* Fix the object encoding, and make sure to convert the encoded * data type into the base type if accordingly to the current @@ -2305,14 +2519,14 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { o->type = OBJ_HASH; o->encoding = OBJ_ENCODING_LISTPACK; - if (hashTypeLength(o) > server.hash_max_listpack_entries || + if (hashTypeLength(o, 0) > server.hash_max_listpack_entries || maxlen > server.hash_max_listpack_value) { - hashTypeConvert(o, OBJ_ENCODING_HT); + hashTypeConvert(o, OBJ_ENCODING_HT, NULL); } } break; - case RDB_TYPE_LIST_ZIPLIST: + case RDB_TYPE_LIST_ZIPLIST: { quicklist *ql = quicklistNew(server.list_max_listpack_size, server.list_compress_depth); @@ -2354,11 +2568,11 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { o->type = OBJ_SET; o->encoding = OBJ_ENCODING_INTSET; if (intsetLen(o->ptr) > server.set_max_intset_entries) - setTypeConvert(o,OBJ_ENCODING_HT); + setTypeConvert(o, OBJ_ENCODING_HT); break; case RDB_TYPE_SET_LISTPACK: if (deep_integrity_validation) server.stat_dump_payload_sanitizations++; - if (!lpValidateIntegrityAndDups(encoded, encoded_len, deep_integrity_validation, 0)) { + if (!lpValidateIntegrityAndDups(encoded, encoded_len, deep_integrity_validation, 1)) { rdbReportCorruptRDB("Set listpack integrity check failed."); zfree(encoded); o->ptr = NULL; @@ -2399,14 +2613,14 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { } if (zsetLength(o) > server.zset_max_listpack_entries) - zsetConvert(o,OBJ_ENCODING_SKIPLIST); + zsetConvert(o, OBJ_ENCODING_SKIPLIST); else o->ptr = lpShrinkToFit(o->ptr); break; } case RDB_TYPE_ZSET_LISTPACK: if (deep_integrity_validation) server.stat_dump_payload_sanitizations++; - if (!lpValidateIntegrityAndDups(encoded, encoded_len, deep_integrity_validation, 1)) { + if (!lpValidateIntegrityAndDups(encoded, encoded_len, deep_integrity_validation, 2)) { rdbReportCorruptRDB("Zset listpack integrity check failed."); 
zfree(encoded); o->ptr = NULL; @@ -2421,7 +2635,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { if (zsetLength(o) > server.zset_max_listpack_entries) - zsetConvert(o,OBJ_ENCODING_SKIPLIST); + zsetConvert(o, OBJ_ENCODING_SKIPLIST); break; case RDB_TYPE_HASH_ZIPLIST: { @@ -2439,35 +2653,54 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { o->ptr = lp; o->type = OBJ_HASH; o->encoding = OBJ_ENCODING_LISTPACK; - if (hashTypeLength(o) == 0) { + if (hashTypeLength(o, 0) == 0) { decrRefCount(o); goto emptykey; } - if (hashTypeLength(o) > server.hash_max_listpack_entries) - hashTypeConvert(o, OBJ_ENCODING_HT); + if (hashTypeLength(o, 0) > server.hash_max_listpack_entries) + hashTypeConvert(o, OBJ_ENCODING_HT, NULL); else o->ptr = lpShrinkToFit(o->ptr); break; } case RDB_TYPE_HASH_LISTPACK: + case RDB_TYPE_HASH_LISTPACK_EX: + /* listpack-encoded hash with TTL requires its own struct + * pointed to by o->ptr */ + o->type = OBJ_HASH; + if (rdbtype == RDB_TYPE_HASH_LISTPACK_EX) { + listpackEx *lpt = listpackExCreate(); + lpt->lp = encoded; + lpt->key = key; + o->ptr = lpt; + o->encoding = OBJ_ENCODING_LISTPACK_EX; + } else + o->encoding = OBJ_ENCODING_LISTPACK; + + /* tuple_len is the number of elements for each key: + * key + value for simple hash, key + value + ttl for hash with TTL */ + int tuple_len = (rdbtype == RDB_TYPE_HASH_LISTPACK ? 
2 : 3); + /* validate read data */ if (deep_integrity_validation) server.stat_dump_payload_sanitizations++; - if (!lpValidateIntegrityAndDups(encoded, encoded_len, deep_integrity_validation, 1)) { + if (!lpValidateIntegrityAndDups(encoded, encoded_len, + deep_integrity_validation, tuple_len)) { rdbReportCorruptRDB("Hash listpack integrity check failed."); - zfree(encoded); - o->ptr = NULL; decrRefCount(o); return NULL; } - o->type = OBJ_HASH; - o->encoding = OBJ_ENCODING_LISTPACK; - if (hashTypeLength(o) == 0) { + + /* if listpack is empty, delete it */ + if (hashTypeLength(o, 0) == 0) { decrRefCount(o); goto emptykey; } - if (hashTypeLength(o) > server.hash_max_listpack_entries) - hashTypeConvert(o, OBJ_ENCODING_HT); + /* Convert listpack to hash table without registering in global HFE DS, + * if has HFEs, since the listpack is not connected yet to the DB */ + if (hashTypeLength(o, 0) > server.hash_max_listpack_entries) + hashTypeConvert(o, OBJ_ENCODING_HT, NULL /*db->hexpires*/); + break; default: /* totally unreachable */ @@ -2553,7 +2786,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { /* Load the last entry ID. */ s->last_id.ms = rdbLoadLen(rdb,NULL); s->last_id.seq = rdbLoadLen(rdb,NULL); - + if (rdbtype >= RDB_TYPE_STREAM_LISTPACKS_2) { /* Load the first entry ID. */ s->first_id.ms = rdbLoadLen(rdb,NULL); @@ -2572,9 +2805,9 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { s->max_deleted_entry_id.ms = 0; s->max_deleted_entry_id.seq = 0; s->entries_added = s->length; - + /* Since the rax is already loaded, we can find the first entry's - * ID. */ + * ID. 
*/ streamGetEdgeID(s,1,1,&s->first_id); } @@ -2740,13 +2973,14 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { decrRefCount(o); return NULL; } - streamNACK *nack = raxFind(cgroup->pel,rawid,sizeof(rawid)); - if (nack == raxNotFound) { + void *result; + if (!raxFind(cgroup->pel,rawid,sizeof(rawid),&result)) { rdbReportCorruptRDB("Consumer entry not found in " "group global PEL"); decrRefCount(o); return NULL; } + streamNACK *nack = result; /* Set the NACK consumer, that was left to NULL when * loading the global PEL. Then set the same shared @@ -2819,7 +3053,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { uint64_t eof = rdbLoadLen(rdb,NULL); if (eof == RDB_LENERR) { if (ptr) { - o = createModuleObject(mt,ptr); /* creating just in order to easily destroy */ + o = createModuleObject(mt, ptr); /* creating just in order to easily destroy */ decrRefCount(o); } return NULL; @@ -2828,7 +3062,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { rdbReportCorruptRDB("The RDB file contains module data for the module '%s' that is not terminated by " "the proper module value EOF marker", moduleTypeModuleName(mt)); if (ptr) { - o = createModuleObject(mt,ptr); /* creating just in order to easily destroy */ + o = createModuleObject(mt, ptr); /* creating just in order to easily destroy */ decrRefCount(o); } return NULL; @@ -2840,11 +3074,12 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { moduleTypeModuleName(mt)); return NULL; } - o = createModuleObject(mt,ptr); + o = createModuleObject(mt, ptr); } else { rdbReportReadError("Unknown RDB encoding type %d",rdbtype); return NULL; } + if (error) *error = 0; return o; @@ -2981,7 +3216,7 @@ int rdbFunctionLoad(rio *rdb, int ver, functionsLibCtx* lib_ctx, int rdbflags, s if (lib_ctx) { sds library_name = NULL; - if (!(library_name = functionsCreateWithLibraryCtx(final_payload, rdbflags & RDBFLAGS_ALLOW_DUP, &error, 
lib_ctx))) { + if (!(library_name = functionsCreateWithLibraryCtx(final_payload, rdbflags & RDBFLAGS_ALLOW_DUP, &error, lib_ctx, 0))) { if (!error) { error = sdsnew("Failed creating the library"); } @@ -3014,7 +3249,6 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { return retval; } - /* Load an RDB file from the rio stream 'rdb'. On success C_OK is returned, * otherwise C_ERR is returned. * The rdb_loading_ctx argument holds objects to which the rdb will be loaded to, @@ -3023,6 +3257,8 @@ int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadingCtx *rdb_loading_ctx) { uint64_t dbid = 0; int type, rdbver; + uint64_t db_size = 0, expires_size = 0; + int should_expand_db = 0; redisDb *db = rdb_loading_ctx->dbarray+0; char buf[1024]; int error; @@ -3098,13 +3334,27 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin } else if (type == RDB_OPCODE_RESIZEDB) { /* RESIZEDB: Hint about the size of the keys in the currently * selected data base, in order to avoid useless rehashing. */ - uint64_t db_size, expires_size; if ((db_size = rdbLoadLen(rdb,NULL)) == RDB_LENERR) goto eoferr; if ((expires_size = rdbLoadLen(rdb,NULL)) == RDB_LENERR) goto eoferr; - dictExpand(db->dict,db_size); - dictExpand(db->expires,expires_size); + should_expand_db = 1; + continue; /* Read next opcode. */ + } else if (type == RDB_OPCODE_SLOT_INFO) { + uint64_t slot_id, slot_size, expires_slot_size; + if ((slot_id = rdbLoadLen(rdb,NULL)) == RDB_LENERR) + goto eoferr; + if ((slot_size = rdbLoadLen(rdb,NULL)) == RDB_LENERR) + goto eoferr; + if ((expires_slot_size = rdbLoadLen(rdb,NULL)) == RDB_LENERR) + goto eoferr; + if (!server.cluster_enabled) { + continue; /* Ignore gracefully. */ + } + /* In cluster mode we resize individual slot specific dictionaries based on the number of keys that slot holds. 
*/ + kvstoreDictExpand(db->keys, slot_id, slot_size); + kvstoreDictExpand(db->expires, slot_id, expires_slot_size); + should_expand_db = 0; continue; /* Read next opcode. */ } else if (type == RDB_OPCODE_AUX) { /* AUX: generic string-string fields. Use to add state to RDB @@ -3234,6 +3484,14 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin continue; } + /* If there is no slot info, it means that it's either not cluster mode or we are trying to load legacy RDB file. + * In this case we want to estimate number of keys per slot and resize accordingly. */ + if (should_expand_db) { + dbExpand(db, db_size, 0); + dbExpandExpires(db, expires_size, 0); + should_expand_db = 0; + } + /* Read key */ if ((key = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) goto eoferr; @@ -3245,8 +3503,8 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin * received from the master. In the latter case, the master is * responsible for key expiry. If we would expire keys here, the * snapshot taken by the master may not be reflected on the slave. - * Similarly, if the base AOF is RDB format, we want to load all - * the keys they are, since the log of operations in the incr AOF + * Similarly, if the base AOF is RDB format, we want to load all + * the keys they are, since the log of operations in the incr AOF * is assumed to work in the exact keyspace state. 
*/ if (val == NULL) { /* Since we used to have bug that could lead to empty keys @@ -3301,6 +3559,14 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin } } + /* If minExpiredField was set, then the object is hash with expiration + * on fields and need to register it in global HFE DS */ + if (val->type == OBJ_HASH) { + uint64_t minExpiredField = hashTypeGetMinExpire(val, 1); + if (minExpiredField != EB_EXPIRE_TIME_INVALID) + hashTypeAddToExpires(db, key, val, minExpiredField); + } + /* Set the expire time if needed */ if (expiretime != -1) { setExpire(NULL,db,&keyobj,expiretime); @@ -3402,19 +3668,19 @@ int rdbLoad(char *filename, rdbSaveInfo *rsi, int rdbflags) { if (retval == C_OK && !(rdbflags & RDBFLAGS_KEEP_CACHE)) { /* TODO: maybe we could combine the fopen and open into one in the future */ rdb_fd = open(filename, O_RDONLY); - if (rdb_fd > 0) bioCreateCloseJob(rdb_fd, 0, 1); + if (rdb_fd >= 0) bioCreateCloseJob(rdb_fd, 0, 1); } return (retval==C_OK) ? RDB_OK : RDB_FAILED; } /* A background saving child (BGSAVE) terminated its work. Handle this. * This function covers the case of actual BGSAVEs. */ -static void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal) { +static void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal, time_t save_end) { if (!bysignal && exitcode == 0) { serverLog(LL_NOTICE, "Background saving terminated with success"); server.dirty = server.dirty - server.dirty_before_bgsave; - server.lastsave = time(NULL); + server.lastsave = save_end; server.lastbgsave_status = C_OK; } else if (!bysignal && exitcode != 0) { serverLog(LL_WARNING, "Background saving error"); @@ -3466,9 +3732,11 @@ static void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) { /* When a background RDB saving/transfer terminates, call the right handler. 
*/ void backgroundSaveDoneHandler(int exitcode, int bysignal) { int type = server.rdb_child_type; + time_t save_end = time(NULL); + switch(server.rdb_child_type) { case RDB_CHILD_TYPE_DISK: - backgroundSaveDoneHandlerDisk(exitcode,bysignal); + backgroundSaveDoneHandlerDisk(exitcode,bysignal,save_end); break; case RDB_CHILD_TYPE_SOCKET: backgroundSaveDoneHandlerSocket(exitcode,bysignal); @@ -3479,7 +3747,7 @@ void backgroundSaveDoneHandler(int exitcode, int bysignal) { } server.rdb_child_type = RDB_CHILD_TYPE_NONE; - server.rdb_save_time_last = time(NULL)-server.rdb_save_time_start; + server.rdb_save_time_last = save_end-server.rdb_save_time_start; server.rdb_save_time_start = -1; /* Possibly there are slaves waiting for a BGSAVE in order to be served * (the first stage of SYNC is a bulk transfer of dump.rdb) */ @@ -3598,6 +3866,7 @@ int rdbSaveToSlavesSockets(int req, rdbSaveInfo *rsi) { } close(rdb_pipe_write); close(server.rdb_pipe_read); + close(server.rdb_child_exit_pipe); zfree(server.rdb_pipe_conns); server.rdb_pipe_conns = NULL; server.rdb_pipe_numconns = 0; diff --git a/src/rdb.h b/src/rdb.h index 234bde221be..65da1932239 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef __RDB_H @@ -38,7 +17,7 @@ /* The current RDB version. When the format changes in a way that is no longer * backward compatible this number gets incremented. */ -#define RDB_VERSION 11 +#define RDB_VERSION 12 /* Defines related to the dump file format. To store 32 bits lengths for short * keys requires a lot of space, so we check the most significant 2 bits of @@ -81,9 +60,6 @@ #define RDB_TYPE_MODULE_PRE_GA 6 /* Used in 4.0 release candidates */ #define RDB_TYPE_MODULE_2 7 /* Module value with annotations for parsing without the generating module being loaded. */ -/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType() BELOW */ - -/* Object types for encoded objects. 
*/ #define RDB_TYPE_HASH_ZIPMAP 9 #define RDB_TYPE_LIST_ZIPLIST 10 #define RDB_TYPE_SET_INTSET 11 @@ -97,12 +73,15 @@ #define RDB_TYPE_STREAM_LISTPACKS_2 19 #define RDB_TYPE_SET_LISTPACK 20 #define RDB_TYPE_STREAM_LISTPACKS_3 21 -/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType() BELOW */ +#define RDB_TYPE_HASH_METADATA 22 +#define RDB_TYPE_HASH_LISTPACK_EX 23 +/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType(), and rdb_type_string[] */ /* Test if a type is an object type. */ -#define rdbIsObjectType(t) (((t) >= 0 && (t) <= 7) || ((t) >= 9 && (t) <= 21)) +#define rdbIsObjectType(t) (((t) >= 0 && (t) <= 7) || ((t) >= 9 && (t) <= 23)) /* Special RDB opcodes (saved/loaded with rdbSaveType/rdbLoadType). */ +#define RDB_OPCODE_SLOT_INFO 244 /* Individual slot info, such as slot id and size (cluster mode only). */ #define RDB_OPCODE_FUNCTION2 245 /* function library data */ #define RDB_OPCODE_FUNCTION_PRE_GA 246 /* old function library data for 7.0 rc1 and rc2 */ #define RDB_OPCODE_MODULE_AUX 247 /* Module auxiliary data. */ @@ -124,13 +103,15 @@ #define RDB_MODULE_OPCODE_STRING 5 /* String. */ /* rdbLoad...() functions flags. */ -#define RDB_LOAD_NONE 0 -#define RDB_LOAD_ENC (1<<0) -#define RDB_LOAD_PLAIN (1<<1) -#define RDB_LOAD_SDS (1<<2) +#define RDB_LOAD_NONE 0 +#define RDB_LOAD_ENC (1<<0) +#define RDB_LOAD_PLAIN (1<<1) +#define RDB_LOAD_SDS (1<<2) +#define RDB_LOAD_HFLD (1<<3) +#define RDB_LOAD_HFLD_TTL (1<<4) /* flags on the purpose of rdb save or load */ -#define RDBFLAGS_NONE 0 /* No special RDB loading. */ +#define RDBFLAGS_NONE 0 /* No special RDB loading or saving. */ #define RDBFLAGS_AOF_PREAMBLE (1<<0) /* Load/save the RDB as AOF preamble. */ #define RDBFLAGS_REPLICATION (1<<1) /* Load/save for SYNC. 
*/ #define RDBFLAGS_ALLOW_DUP (1<<2) /* Allow duplicated keys when loading.*/ @@ -139,15 +120,15 @@ /* When rdbLoadObject() returns NULL, the err flag is * set to hold the type of error that occurred */ -#define RDB_LOAD_ERR_EMPTY_KEY 1 /* Error of empty key */ -#define RDB_LOAD_ERR_OTHER 2 /* Any other errors */ +#define RDB_LOAD_ERR_EMPTY_KEY 1 /* Error of empty key */ +#define RDB_LOAD_ERR_OTHER 2 /* Any other errors */ ssize_t rdbWriteRaw(rio *rdb, void *p, size_t len); int rdbSaveType(rio *rdb, unsigned char type); int rdbLoadType(rio *rdb); time_t rdbLoadTime(rio *rdb); int rdbSaveLen(rio *rdb, uint64_t len); -int rdbSaveMillisecondTime(rio *rdb, long long t); +ssize_t rdbSaveMillisecondTime(rio *rdb, long long t); long long rdbLoadMillisecondTime(rio *rdb, int rdbver); uint64_t rdbLoadLen(rio *rdb, int *isencoded); int rdbLoadLenByRef(rio *rdb, int *isencoded, uint64_t *lenptr); diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c index 1ee562f20ca..94151ba8af9 100644 --- a/src/redis-benchmark.c +++ b/src/redis-benchmark.c @@ -1,35 +1,13 @@ /* Redis benchmark utility. * - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "fmacros.h" -#include "version.h" #include #include @@ -167,7 +145,6 @@ typedef struct clusterNode { sds replicate; /* Master ID if node is a slave */ int *slots; int slots_count; - int current_slot_index; int *updated_slots; /* Used by updateClusterSlotsConfiguration */ int updated_slots_count; /* Used by updateClusterSlotsConfiguration */ int replicas_count; @@ -186,8 +163,6 @@ typedef struct redisConfig { } redisConfig; /* Prototypes */ -char *redisGitSHA1(void); -char *redisGitDirty(void); static void writeHandler(aeEventLoop *el, int fd, void *privdata, int mask); static void createMissingClients(client c); static benchmarkThread *createBenchmarkThread(int index); @@ -205,20 +180,6 @@ static void updateClusterSlotsConfiguration(void); int showThroughput(struct aeEventLoop *eventLoop, long long id, void *clientData); -static sds benchmarkVersion(void) { - sds version; - version = sdscatprintf(sdsempty(), "%s", REDIS_VERSION); - - /* Add git commit and working tree status when available */ - if (strtoll(redisGitSHA1(),NULL,16)) { - 
version = sdscatprintf(version, " (git:%s", redisGitSHA1()); - if (strtoll(redisGitDirty(),NULL,10)) - version = sdscatprintf(version, "-dirty"); - version = sdscat(version, ")"); - } - return version; -} - /* Dict callbacks */ static uint64_t dictSdsHash(const void *key); static int dictSdsKeyCompare(dict *d, const void *key1, const void *key2); @@ -434,7 +395,6 @@ static void setClusterKeyHashTag(client c) { assert(c->thread_id >= 0); clusterNode *node = c->cluster_node; assert(node); - assert(node->current_slot_index < node->slots_count); int is_updating_slots = 0; atomicGet(config.is_updating_slots, is_updating_slots); /* If updateClusterSlotsConfiguration is updating the slots array, @@ -444,7 +404,7 @@ static void setClusterKeyHashTag(client c) { * updateClusterSlotsConfiguration won't actually do anything, since * the updated_slots_count array will be already NULL. */ if (is_updating_slots) updateClusterSlotsConfiguration(); - int slot = node->slots[node->current_slot_index]; + int slot = node->slots[rand() % node->slots_count]; const char *tag = crc16_slot_table[slot]; int taglen = strlen(tag); size_t i; @@ -1064,7 +1024,6 @@ static clusterNode *createClusterNode(char *ip, int port) { node->replicas_count = 0; node->slots = zmalloc(CLUSTER_SLOTS * sizeof(int)); node->slots_count = 0; - node->current_slot_index = 0; node->updated_slots = NULL; node->updated_slots_count = 0; node->migrating = NULL; @@ -1387,7 +1346,6 @@ static void updateClusterSlotsConfiguration(void) { int *oldslots = node->slots; node->slots = node->updated_slots; node->slots_count = node->updated_slots_count; - node->current_slot_index = 0; node->updated_slots = NULL; node->updated_slots_count = 0; zfree(oldslots); @@ -1423,7 +1381,7 @@ int parseOptions(int argc, char **argv) { if (lastarg) goto invalid; config.numclients = atoi(argv[++i]); } else if (!strcmp(argv[i],"-v") || !strcmp(argv[i], "--version")) { - sds version = benchmarkVersion(); + sds version = cliVersion(); 
printf("redis-benchmark %s\n", version); sdsfree(version); exit(0); @@ -1613,7 +1571,10 @@ int parseOptions(int argc, char **argv) { " -s Server socket (overrides host and port)\n" " -a Password for Redis Auth\n" " --user Used to send ACL style 'AUTH username pass'. Needs -a.\n" -" -u Server URI.\n" +" -u Server URI on format redis://user:password@host:port/dbnum\n" +" User, password and dbnum are optional. For authentication\n" +" without a username, use username 'default'. For TLS, use\n" +" the scheme 'rediss'.\n" " -c Number of parallel connections (default 50).\n" " Note: If --cluster is used then number of clients has to be\n" " the same or higher than the number of nodes.\n" @@ -1888,8 +1849,12 @@ int main(int argc, char **argv) { sds_args[argc] = readArgFromStdin(); argc++; } + /* Setup argument length */ + size_t *argvlen = zmalloc(argc*sizeof(size_t)); + for (i = 0; i < argc; i++) + argvlen[i] = sdslen(sds_args[i]); do { - len = redisFormatCommandArgv(&cmd,argc,(const char**)sds_args,NULL); + len = redisFormatCommandArgv(&cmd,argc,(const char**)sds_args,argvlen); // adjust the datasize to the parsed command config.datasize = len; benchmark(title,cmd,len); @@ -1899,6 +1864,7 @@ int main(int argc, char **argv) { sdsfree(title); if (config.redis_config != NULL) freeRedisConfig(config.redis_config); + zfree(argvlen); return 0; } diff --git a/src/redis-check-aof.c b/src/redis-check-aof.c index 616177a8b7a..56298387e26 100644 --- a/src/redis-check-aof.c +++ b/src/redis-check-aof.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2009-2012, Pieter Noordhuis - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-current, Redis Ltd. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -29,6 +29,7 @@ */ #include "server.h" + #include #include #include @@ -233,11 +234,13 @@ int checkSingleAof(char *aof_filename, char *aof_filepath, int last_file, int fi struct redis_stat sb; if (redis_fstat(fileno(fp),&sb) == -1) { printf("Cannot stat file: %s, aborting...\n", aof_filename); + fclose(fp); exit(1); } off_t size = sb.st_size; if (size == 0) { + fclose(fp); return AOF_CHECK_EMPTY; } @@ -343,6 +346,7 @@ int fileIsRDB(char *filepath) { struct redis_stat sb; if (redis_fstat(fileno(fp), &sb) == -1) { printf("Cannot stat file: %s\n", filepath); + fclose(fp); exit(1); } @@ -379,6 +383,7 @@ int fileIsManifest(char *filepath) { struct redis_stat sb; if (redis_fstat(fileno(fp), &sb) == -1) { printf("Cannot stat file: %s\n", filepath); + fclose(fp); exit(1); } @@ -395,15 +400,20 @@ int fileIsManifest(char *filepath) { break; } else { printf("Cannot read file: %s\n", filepath); + fclose(fp); exit(1); } } - /* Skip comments lines */ + /* We will skip comments lines. + * At present, the manifest format is fixed, see aofInfoFormat. + * We will break directly as long as it encounters other items. */ if (buf[0] == '#') { continue; } else if (!memcmp(buf, "file", strlen("file"))) { is_manifest = 1; + } else { + break; } } @@ -514,6 +524,13 @@ int redis_check_aof_main(int argc, char **argv) { if (argc < 2) { goto invalid_args; } else if (argc == 2) { + if (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version")) { + sds version = getVersion(); + printf("redis-check-aof %s\n", version); + sdsfree(version); + exit(0); + } + filepath = argv[1]; } else if (argc == 3) { if (!strcmp(argv[1], "--fix")) { diff --git a/src/redis-check-rdb.c b/src/redis-check-rdb.c index 682135e55f2..090c1bd4433 100644 --- a/src/redis-check-rdb.c +++ b/src/redis-check-rdb.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2016, Salvatore Sanfilippo + * Copyright (c) 2016-Present, Redis Ltd. * All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #include "mt19937-64.h" @@ -98,7 +77,11 @@ char *rdb_type_string[] = { "hash-listpack", "zset-listpack", "quicklist-v2", + "stream-v2", "set-listpack", + "stream-v3", + "hash-hashtable-md", + "hash-listpack-md", }; /* Show a few stats collected into 'rdbstate' */ @@ -276,6 +259,15 @@ int redis_check_rdb(char *rdbfilename, FILE *fp) { if ((expires_size = rdbLoadLen(&rdb,NULL)) == RDB_LENERR) goto eoferr; continue; /* Read type again. */ + } else if (type == RDB_OPCODE_SLOT_INFO) { + uint64_t slot_id, slot_size, expires_slot_size; + if ((slot_id = rdbLoadLen(&rdb,NULL)) == RDB_LENERR) + goto eoferr; + if ((slot_size = rdbLoadLen(&rdb,NULL)) == RDB_LENERR) + goto eoferr; + if ((expires_slot_size = rdbLoadLen(&rdb,NULL)) == RDB_LENERR) + goto eoferr; + continue; /* Read type again. */ } else if (type == RDB_OPCODE_AUX) { /* AUX: generic string-string fields. Use to add state to RDB * which is backward compatible. Implementations of RDB loading @@ -341,7 +333,8 @@ int redis_check_rdb(char *rdbfilename, FILE *fp) { rdbstate.keys++; /* Read value */ rdbstate.doing = RDB_CHECK_DOING_READ_OBJECT_VALUE; - if ((val = rdbLoadObject(type,&rdb,key->ptr,selected_dbid,NULL)) == NULL) goto eoferr; + if ((val = rdbLoadObject(type,&rdb,key->ptr,selected_dbid,NULL)) == NULL) + goto eoferr; /* Check if the key already expired. 
*/ if (expiretime != -1 && expiretime < now) rdbstate.already_expired++; @@ -385,20 +378,6 @@ int redis_check_rdb(char *rdbfilename, FILE *fp) { return 1; } -static sds checkRdbVersion(void) { - sds version; - version = sdscatprintf(sdsempty(), "%s", REDIS_VERSION); - - /* Add git commit and working tree status when available */ - if (strtoll(redisGitSHA1(),NULL,16)) { - version = sdscatprintf(version, " (git:%s", redisGitSHA1()); - if (strtoll(redisGitDirty(),NULL,10)) - version = sdscatprintf(version, "-dirty"); - version = sdscat(version, ")"); - } - return version; -} - /* RDB check main: called form server.c when Redis is executed with the * redis-check-rdb alias, on during RDB loading errors. * @@ -418,7 +397,7 @@ int redis_check_rdb_main(int argc, char **argv, FILE *fp) { fprintf(stderr, "Usage: %s \n", argv[0]); exit(1); } else if (!strcmp(argv[1],"-v") || !strcmp(argv[1], "--version")) { - sds version = checkRdbVersion(); + sds version = getVersion(); printf("redis-check-rdb %s\n", version); sdsfree(version); exit(0); diff --git a/src/redis-cli.c b/src/redis-cli.c index de34965b48e..2590fb182ba 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -1,36 +1,15 @@ /* Redis CLI (command line interface) * - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "fmacros.h" -#include "version.h" +#include #include #include #include @@ -64,8 +43,8 @@ #include "connection.h" #include "cli_common.h" #include "mt19937-64.h" - #include "cli_commands.h" +#include "hdr_histogram.h" #define UNUSED(V) ((void) V) @@ -163,6 +142,10 @@ /* DNS lookup */ #define NET_IP_STR_LEN 46 /* INET6_ADDRSTRLEN is 46 */ +#define REFRESH_INTERVAL 300 /* milliseconds */ + +#define IS_TTY_OR_FAKETTY() (isatty(STDOUT_FILENO) || getenv("FAKETTY")) + /* --latency-dist palettes. 
*/ int spectrum_palette_color_size = 19; int spectrum_palette_color[] = {0,233,234,235,237,239,241,243,245,247,144,143,142,184,226,214,208,202,196}; @@ -213,6 +196,7 @@ static int createClusterManagerCommand(char *cmdname, int argc, char **argv); static redisContext *context; static struct config { cliConnInfo conn_info; + struct timeval connect_timeout; char *hostsocket; int tls; cliSSLconfig sslconfig; @@ -246,8 +230,11 @@ static struct config { char *rdb_filename; int bigkeys; int memkeys; - unsigned memkeys_samples; + long long memkeys_samples; int hotkeys; + int keystats; + unsigned long long cursor; + unsigned long top_sizes_limit; int stdin_lastarg; /* get last arg from stdin. (-x option) */ int stdin_tag_arg; /* get arg from stdin. (-X option) */ char *stdin_tag_name; /* Placeholder(tag name) for user input. */ @@ -277,6 +264,8 @@ static struct config { char *server_version; char *test_hint; char *test_hint_file; + int prefer_ipv4; /* Prefer IPv4 over IPv6 on DNS lookup. */ + int prefer_ipv6; /* Prefer IPv6 over IPv4 on DNS lookup. */ } config; /* User preferences. */ @@ -287,8 +276,6 @@ static struct pref { static volatile sig_atomic_t force_cancel_loop = 0; static void usage(int err); static void slaveMode(int send_sync); -char *redisGitSHA1(void); -char *redisGitDirty(void); static int cliConnect(int flags); static char *getInfoField(char *info, char *field); @@ -404,6 +391,37 @@ void dictListDestructor(dict *d, void *val) listRelease((list*)val); } +/* Erase the lines before printing, and returns the number of lines printed */ +int cleanPrintfln(char *fmt, ...) 
{ + va_list args; + char buf[1024]; /* limitation */ + int char_count, line_count = 0; + + /* Clear the line if in TTY */ + if (IS_TTY_OR_FAKETTY()) { + printf("\033[2K\r"); + } + + va_start(args, fmt); + char_count = vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + if (char_count >= (int)sizeof(buf)) { + fprintf(stderr, "Warning: String was trimmed in cleanPrintln\n"); + } + + char *position, *string = buf; + while ((position = strchr(string, '\n')) != NULL) { + int line_length = (int)(position - string); + printf("%.*s\n", line_length, string); + string = position + 1; + line_count++; + } + + printf("%s\n", string); + return line_count + 1; +} + /*------------------------------------------------------------------------------ * Help functions *--------------------------------------------------------------------------- */ @@ -424,20 +442,6 @@ typedef struct { static helpEntry *helpEntries = NULL; static int helpEntriesLen = 0; -static sds cliVersion(void) { - sds version; - version = sdscatprintf(sdsempty(), "%s", REDIS_VERSION); - - /* Add git commit and working tree status when available */ - if (strtoll(redisGitSHA1(),NULL,16)) { - version = sdscatprintf(version, " (git:%s", redisGitSHA1()); - if (strtoll(redisGitDirty(),NULL,10)) - version = sdscatprintf(version, "-dirty"); - version = sdscat(version, ")"); - } - return version; -} - /* For backwards compatibility with pre-7.0 servers. * cliLegacyInitHelp() sets up the helpEntries array with the command and group * names from the commands.c file. 
However the Redis instance we are connecting @@ -760,8 +764,13 @@ static int versionIsSupported(sds version, sds since) { } versionPos = strchr(versionPos, '.'); sincePos = strchr(sincePos, '.'); - if (!versionPos || !sincePos) - return 0; + + /* If we finished to parse both `version` and `since`, it means they are equal */ + if (!versionPos && !sincePos) return 1; + + /* Different number of digits considered as not supported */ + if (!versionPos || !sincePos) return 0; + versionPos++; sincePos++; } @@ -778,7 +787,7 @@ static void removeUnsupportedArgs(struct cliCommandArg *args, int *numargs, sds i++; continue; } - for (j = i; j != *numargs; j++) { + for (j = i; j != *numargs - 1; j++) { args[j] = args[j + 1]; } (*numargs)--; @@ -1262,7 +1271,7 @@ static int matchNoTokenArg(char **nextword, int numwords, cliCommandArg *arg) { case ARG_TYPE_INTEGER: case ARG_TYPE_UNIX_TIME: { long long value; - if (sscanf(*nextword, "%lld", &value)) { + if (sscanf(*nextword, "%lld", &value) == 1) { arg->matched += 1; arg->matched_name = 1; arg->matched_all = 1; @@ -1276,7 +1285,7 @@ static int matchNoTokenArg(char **nextword, int numwords, cliCommandArg *arg) { case ARG_TYPE_DOUBLE: { double value; - if (sscanf(*nextword, "%lf", &value)) { + if (sscanf(*nextword, "%lf", &value) == 1) { arg->matched += 1; arg->matched_name = 1; arg->matched_all = 1; @@ -1657,15 +1666,17 @@ static int cliConnect(int flags) { redisFree(context); config.dbnum = 0; config.in_multi = 0; + config.pubsub_mode = 0; cliRefreshPrompt(); } /* Do not use hostsocket when we got redirected in cluster mode */ if (config.hostsocket == NULL || (config.cluster_mode && config.cluster_reissue_command)) { - context = redisConnect(config.conn_info.hostip,config.conn_info.hostport); + context = redisConnectWrapper(config.conn_info.hostip, config.conn_info.hostport, + config.connect_timeout); } else { - context = redisConnectUnix(config.hostsocket); + context = redisConnectUnixWrapper(config.hostsocket, 
config.connect_timeout); } if (!context->err && config.tls) { @@ -2291,8 +2302,12 @@ static int cliReadReply(int output_raw_strings) { slot = atoi(s+1); s = strrchr(p+1,':'); /* MOVED 3999[P]127.0.0.1[S]6381 */ *s = '\0'; - sdsfree(config.conn_info.hostip); - config.conn_info.hostip = sdsnew(p+1); + if (p+1 != s) { + /* Host might be empty, like 'MOVED 3999 :6381', if endpoint type is unknown. Only update the + * host if it's non-empty. */ + sdsfree(config.conn_info.hostip); + config.conn_info.hostip = sdsnew(p+1); + } config.conn_info.hostport = atoi(s+1); if (config.interactive) printf("-> Redirected to slot [%d] located at %s:%d\n", @@ -2604,7 +2619,8 @@ static redisReply *reconnectingRedisCommand(redisContext *c, const char *fmt, .. fflush(stdout); redisFree(c); - c = redisConnect(config.conn_info.hostip,config.conn_info.hostport); + c = redisConnectWrapper(config.conn_info.hostip, config.conn_info.hostport, + config.connect_timeout); if (!c->err && config.tls) { const char *err = NULL; if (cliSecureConnection(c, config.sslconfig, &err) == REDIS_ERR && err) { @@ -2659,6 +2675,15 @@ static int parseOptions(int argc, char **argv) { fprintf(stderr, "Invalid server port.\n"); exit(1); } + } else if (!strcmp(argv[i],"-t") && !lastarg) { + char *eptr; + double seconds = strtod(argv[++i], &eptr); + if (eptr[0] != '\0' || isnan(seconds) || seconds < 0.0) { + fprintf(stderr, "Invalid connection timeout for -t.\n"); + exit(1); + } + config.connect_timeout.tv_sec = (long long)seconds; + config.connect_timeout.tv_usec = ((long long)(seconds * 1000000)) % 1000000; } else if (!strcmp(argv[i],"-s") && !lastarg) { config.hostsocket = argv[++i]; } else if (!strcmp(argv[i],"-r") && !lastarg) { @@ -2754,12 +2779,63 @@ static int parseOptions(int argc, char **argv) { config.bigkeys = 1; } else if (!strcmp(argv[i],"--memkeys")) { config.memkeys = 1; - config.memkeys_samples = 0; /* use redis default */ + config.memkeys_samples = -1; /* use redis default */ } else if 
(!strcmp(argv[i],"--memkeys-samples") && !lastarg) { + char *endptr; config.memkeys = 1; - config.memkeys_samples = atoi(argv[++i]); + config.keystats = 1; + config.memkeys_samples = strtoll(argv[++i], &endptr, 10); + if (*endptr) { + fprintf(stderr, "--memkeys-samples conversion error.\n"); + exit(1); + } + if (config.memkeys_samples < 0) { + fprintf(stderr, "--memkeys-samples value should be positive.\n"); + exit(1); + } } else if (!strcmp(argv[i],"--hotkeys")) { config.hotkeys = 1; + } else if (!strcmp(argv[i], "--keystats")) { + config.keystats = 1; + config.memkeys_samples = -1; /* use redis default */ + } else if (!strcmp(argv[i],"--keystats-samples") && !lastarg) { + char *endptr; + config.keystats = 1; + config.memkeys_samples = strtoll(argv[++i], &endptr, 10); + if (*endptr) { + fprintf(stderr, "--keystats-samples conversion error.\n"); + exit(1); + } + if (config.memkeys_samples < 0) { + fprintf(stderr, "--keystats-samples value should be positive.\n"); + exit(1); + } + } else if (!strcmp(argv[i],"--cursor") && !lastarg) { + i++; + char sign = *argv[i]; + char *endptr; + config.cursor = strtoull(argv[i], &endptr, 10); + if (*endptr) { + fprintf(stderr, "--cursor conversion error.\n"); + exit(1); + } + if (sign == '-' && config.cursor != 0) { + fprintf(stderr, "--cursor should be followed by a positive integer.\n"); + exit(1); + } + } else if (!strcmp(argv[i],"--top") && !lastarg) { + i++; + char sign = *argv[i]; + char *endptr; + config.top_sizes_limit = strtoull(argv[i], &endptr, 10); + if (*endptr) { + fprintf(stderr, "--top conversion error.\n"); + exit(1); + } + if (sign == '-' && config.top_sizes_limit != 0) { + fprintf(stderr, "--top should be followed by a positive integer.\n"); + exit(1); + } } else if (!strcmp(argv[i],"--eval") && !lastarg) { config.eval = argv[++i]; } else if (!strcmp(argv[i],"--ldb")) { @@ -2781,6 +2857,10 @@ static int parseOptions(int argc, char **argv) { config.set_errcode = 1; } else if (!strcmp(argv[i],"--verbose")) { 
config.verbose = 1; + } else if (!strcmp(argv[i],"-4")) { + config.prefer_ipv4 = 1; + } else if (!strcmp(argv[i],"-6")) { + config.prefer_ipv6 = 1; } else if (!strcmp(argv[i],"--cluster") && !lastarg) { if (CLUSTER_MANAGER_MODE()) usage(1); char *cmd = argv[++i]; @@ -2965,6 +3045,11 @@ static int parseOptions(int argc, char **argv) { exit(1); } + if (config.prefer_ipv4 && config.prefer_ipv6) { + fprintf(stderr, "Options -4 and -6 are mutually exclusive.\n"); + exit(1); + } + return i; } @@ -3013,6 +3098,8 @@ static void usage(int err) { "Usage: redis-cli [OPTIONS] [cmd [arg [arg ...]]]\n" " -h Server hostname (default: 127.0.0.1).\n" " -p Server port (default: 6379).\n" +" -t Server connection timeout in seconds (decimals allowed).\n" +" Default timeout is 0, meaning no limit, depending on the OS.\n" " -s Server socket (overrides hostname and port).\n" " -a Password to use when connecting to the server.\n" " You can also use the " REDIS_CLI_AUTH_ENV " environment\n" @@ -3023,7 +3110,10 @@ static void usage(int err) { " --askpass Force user to input password with mask from STDIN.\n" " If this argument is used, '-a' and " REDIS_CLI_AUTH_ENV "\n" " environment variable will be ignored.\n" -" -u Server URI.\n" +" -u Server URI on format redis://user:password@host:port/dbnum\n" +" User, password and dbnum are optional. For authentication\n" +" without a username, use username 'default'. 
For TLS, use\n" +" the scheme 'rediss'.\n" " -r Execute specified command N times.\n" " -i When -r is used, waits seconds per command.\n" " It is possible to specify sub-second times like -i 0.1.\n" @@ -3038,6 +3128,8 @@ static void usage(int err) { " -D Delimiter between responses for raw formatting (default: \\n).\n" " -c Enable cluster mode (follow -ASK and -MOVED redirections).\n" " -e Return exit error code when command execution fails.\n" +" -4 Prefer IPv4 over IPv6 on DNS lookup.\n" +" -6 Prefer IPv6 over IPv4 on DNS lookup.\n" "%s" " --raw Use raw formatting for replies (default when STDOUT is\n" " not a tty).\n" @@ -3079,6 +3171,13 @@ version,tls_usage); " --memkeys Sample Redis keys looking for keys consuming a lot of memory.\n" " --memkeys-samples Sample Redis keys looking for keys consuming a lot of memory.\n" " And define number of key elements to sample\n" +" --keystats Sample Redis keys looking for keys memory size and length (combine bigkeys and memkeys).\n" +" --keystats-samples Sample Redis keys looking for keys memory size and length.\n" +" And define number of key elements to sample (only for memory usage).\n" +" --cursor Start the scan at the cursor (usually after a Ctrl-C).\n" +" Optionally used with --keystats and --keystats-samples.\n" +" --top To display top key sizes (default: 10).\n" +" Optionally used with --keystats and --keystats-samples.\n" " --hotkeys Sample Redis keys looking for hot keys.\n" " only works when maxmemory-policy is *lfu.\n" " --scan List all keys using the SCAN command.\n" @@ -3108,6 +3207,7 @@ version,tls_usage); " Use --cluster help to list all available cluster manager commands.\n" "\n" "Examples:\n" +" redis-cli -u redis://default:PASSWORD@localhost:6379/0\n" " cat /etc/passwd | redis-cli -x set mypasswd\n" " redis-cli -D \"\" --raw dump key > key.dump && redis-cli -X dump_tag restore key2 0 dump_tag replace < key.dump\n" " redis-cli -r 100 lpush mylist x\n" @@ -3257,16 +3357,20 @@ void cliLoadPreferences(void) { 
/* Some commands can include sensitive information and shouldn't be put in the * history file. Currently these commands are include: * - AUTH - * - ACL SETUSER - * - CONFIG SET masterauth/masteruser/requirepass + * - ACL DELUSER, ACL SETUSER, ACL GETUSER + * - CONFIG SET masterauth/masteruser/tls-key-file-pass/tls-client-key-file-pass/requirepass * - HELLO with [AUTH username password] - * - MIGRATE with [AUTH password] or [AUTH2 username password] */ + * - MIGRATE with [AUTH password] or [AUTH2 username password] + * - SENTINEL CONFIG SET sentinel-pass password, SENTINEL CONFIG SET sentinel-user username + * - SENTINEL SET auth-pass password, SENTINEL SET auth-user username */ static int isSensitiveCommand(int argc, char **argv) { if (!strcasecmp(argv[0],"auth")) { return 1; } else if (argc > 1 && - !strcasecmp(argv[0],"acl") && - !strcasecmp(argv[1],"setuser")) + !strcasecmp(argv[0],"acl") && ( + !strcasecmp(argv[1],"deluser") || + !strcasecmp(argv[1],"setuser") || + !strcasecmp(argv[1],"getuser"))) { return 1; } else if (argc > 2 && @@ -3274,8 +3378,10 @@ static int isSensitiveCommand(int argc, char **argv) { !strcasecmp(argv[1],"set")) { for (int j = 2; j < argc; j = j+2) { if (!strcasecmp(argv[j],"masterauth") || - !strcasecmp(argv[j],"masteruser") || - !strcasecmp(argv[j],"requirepass")) { + !strcasecmp(argv[j],"masteruser") || + !strcasecmp(argv[j],"tls-key-file-pass") || + !strcasecmp(argv[j],"tls-client-key-file-pass") || + !strcasecmp(argv[j],"requirepass")) { return 1; } } @@ -3305,6 +3411,24 @@ static int isSensitiveCommand(int argc, char **argv) { return 0; } } + } else if (argc > 4 && !strcasecmp(argv[0], "sentinel")) { + /* SENTINEL CONFIG SET sentinel-pass password + * SENTINEL CONFIG SET sentinel-user username */ + if (!strcasecmp(argv[1], "config") && + !strcasecmp(argv[2], "set") && + (!strcasecmp(argv[3], "sentinel-pass") || + !strcasecmp(argv[3], "sentinel-user"))) + { + return 1; + } + /* SENTINEL SET auth-pass password + * SENTINEL SET 
auth-user username */ + if (!strcasecmp(argv[1], "set") && + (!strcasecmp(argv[3], "auth-pass") || + !strcasecmp(argv[3], "auth-user"))) + { + return 1; + } } return 0; } @@ -3331,7 +3455,7 @@ static void repl(void) { linenoiseSetFreeHintsCallback(freeHintsCallback); /* Only use history and load the rc file when stdin is a tty. */ - if (isatty(fileno(stdin))) { + if (getenv("FAKETTY_WITH_PROMPT") != NULL || isatty(fileno(stdin))) { historyfile = getDotfilePath(REDIS_CLI_HISTFILE_ENV,REDIS_CLI_HISTFILE_DEFAULT); //keep in-memory history always regardless if history file can be determined history = 1; @@ -3361,7 +3485,7 @@ static void repl(void) { if (argv == NULL) { printf("Invalid argument(s)\n"); fflush(stdout); - if (history) linenoiseHistoryAdd(line); + if (history) linenoiseHistoryAdd(line, 0); if (historyfile) linenoiseHistorySave(historyfile); linenoiseFree(line); continue; @@ -3387,10 +3511,11 @@ static void repl(void) { repeat = 1; } - if (!isSensitiveCommand(argc - skipargs, argv + skipargs)) { - if (history) linenoiseHistoryAdd(line); - if (historyfile) linenoiseHistorySave(historyfile); - } + /* Always keep in-memory history. But for commands with sensitive information, + * avoid writing them to the history file. */ + int is_sensitive = isSensitiveCommand(argc - skipargs, argv + skipargs); + if (history) linenoiseHistoryAdd(line, is_sensitive); + if (!is_sensitive && historyfile) linenoiseHistorySave(historyfile); if (strcasecmp(argv[0],"quit") == 0 || strcasecmp(argv[0],"exit") == 0) @@ -3736,7 +3861,7 @@ typedef struct clusterManagerCommandDef { } clusterManagerCommandDef; clusterManagerCommandDef clusterManagerCommands[] = { - {"create", clusterManagerCommandCreate, -2, "host1:port1 ... hostN:portN", + {"create", clusterManagerCommandCreate, -1, "host1:port1 ... 
hostN:portN", "replicas "}, {"check", clusterManagerCommandCheck, -1, " or - separated by either colon or space", "search-multiple-owners"}, @@ -4043,7 +4168,7 @@ static int clusterManagerExecTransaction(clusterManagerNode *node, static int clusterManagerNodeConnect(clusterManagerNode *node) { if (node->context) redisFree(node->context); - node->context = redisConnect(node->ip, node->port); + node->context = redisConnectWrapper(node->ip, node->port, config.connect_timeout); if (!node->context->err && config.tls) { const char *err = NULL; if (cliSecureConnection(node->context, config.sslconfig, &err) == REDIS_ERR && err) { @@ -4573,7 +4698,7 @@ static void clusterManagerShowNodes(void) { static void clusterManagerShowClusterInfo(void) { int masters = 0; - int keys = 0; + long long keys = 0; listIter li; listNode *ln; listRewind(cluster_manager.nodes, &li); @@ -4582,7 +4707,7 @@ static void clusterManagerShowClusterInfo(void) { if (!(node->flags & CLUSTER_MANAGER_FLAG_SLAVE)) { if (!node->name) continue; int replicas = 0; - int dbsize = -1; + long long dbsize = -1; char name[9]; memcpy(name, node->name, 8); name[8] = '\0'; @@ -4608,14 +4733,14 @@ static void clusterManagerShowClusterInfo(void) { return; }; if (reply != NULL) freeReplyObject(reply); - printf("%s:%d (%s...) -> %d keys | %d slots | %d slaves.\n", + printf("%s:%d (%s...) -> %lld keys | %d slots | %d slaves.\n", node->ip, node->port, name, dbsize, node->slots_count, replicas); masters++; keys += dbsize; } } - clusterManagerLogOk("[OK] %d keys in %d masters.\n", keys, masters); + clusterManagerLogOk("[OK] %lld keys in %d masters.\n", keys, masters); float keys_per_slot = keys / (float) CLUSTER_MANAGER_SLOTS; printf("%.2f keys per slot on average.\n", keys_per_slot); } @@ -7055,7 +7180,10 @@ static int clusterManagerCommandCreate(int argc, char **argv) { first = node; /* Although hiredis supports connecting to a hostname, CLUSTER * MEET requires an IP address, so we do a DNS lookup here. 
*/ - if (anetResolve(NULL, first->ip, first_ip, sizeof(first_ip), ANET_NONE) + int anet_flags = ANET_NONE; + if (config.prefer_ipv4) anet_flags |= ANET_PREFER_IPV4; + if (config.prefer_ipv6) anet_flags |= ANET_PREFER_IPV6; + if (anetResolve(NULL, first->ip, first_ip, sizeof(first_ip), anet_flags) == ANET_ERR) { fprintf(stderr, "Invalid IP address or hostname specified: %s\n", first->ip); @@ -7250,7 +7378,10 @@ static int clusterManagerCommandAddNode(int argc, char **argv) { "join the cluster.\n", ip, port); /* CLUSTER MEET requires an IP address, so we do a DNS lookup here. */ char first_ip[NET_IP_STR_LEN]; - if (anetResolve(NULL, first->ip, first_ip, sizeof(first_ip), ANET_NONE) == ANET_ERR) { + int anet_flags = ANET_NONE; + if (config.prefer_ipv4) anet_flags |= ANET_PREFER_IPV4; + if (config.prefer_ipv6) anet_flags |= ANET_PREFER_IPV6; + if (anetResolve(NULL, first->ip, first_ip, sizeof(first_ip), anet_flags) == ANET_ERR) { fprintf(stderr, "Invalid IP address or hostname specified: %s\n", first->ip); success = 0; goto cleanup; @@ -7862,7 +7993,7 @@ static int clusterManagerCommandImport(int argc, char **argv) { char *reply_err = NULL; redisReply *src_reply = NULL; // Connect to the source node. 
- redisContext *src_ctx = redisConnect(src_ip, src_port); + redisContext *src_ctx = redisConnectWrapper(src_ip, src_port, config.connect_timeout); if (src_ctx->err) { success = 0; fprintf(stderr,"Could not connect to Redis at %s:%d: %s.\n", src_ip, @@ -8834,7 +8965,8 @@ static redisReply *sendScan(unsigned long long *it) { reply = redisCommand(context, "SCAN %llu MATCH %b COUNT %d", *it, config.pattern, sdslen(config.pattern), config.count); else - reply = redisCommand(context,"SCAN %llu",*it); + reply = redisCommand(context, "SCAN %llu COUNT %d", + *it, config.count); /* Handle any error conditions */ if(reply == NULL) { @@ -8885,6 +9017,28 @@ static int getDbSize(void) { return size; } +static int getDatabases(void) { + redisReply *reply; + int dbnum; + + reply = redisCommand(context, "CONFIG GET databases"); + + if (reply == NULL) { + fprintf(stderr, "\nI/O error\n"); + exit(1); + } else if (reply->type == REDIS_REPLY_ERROR) { + dbnum = 16; + fprintf(stderr, "CONFIG GET databases fails: %s, use default value 16 instead\n", reply->str); + } else { + assert(reply->type == (config.current_resp3 ? 
REDIS_REPLY_MAP : REDIS_REPLY_ARRAY)); + assert(reply->elements == 2); + dbnum = atoi(reply->element[1]->str); + } + + freeReplyObject(reply); + return dbnum; +} + typedef struct { char *name; char *sizecmd; @@ -8973,7 +9127,7 @@ static void getKeyTypes(dict *types_dict, redisReply *keys, typeinfo **types) { static void getKeySizes(redisReply *keys, typeinfo **types, unsigned long long *sizes, int memkeys, - unsigned memkeys_samples) + long long memkeys_samples) { redisReply *reply; unsigned int i; @@ -8988,7 +9142,7 @@ static void getKeySizes(redisReply *keys, typeinfo **types, const char* argv[] = {types[i]->sizecmd, keys->element[i]->str}; size_t lens[] = {strlen(types[i]->sizecmd), keys->element[i]->len}; redisAppendCommandArgv(context, 2, argv, lens); - } else if (memkeys_samples==0) { + } else if (memkeys_samples == -1) { const char* argv[] = {"MEMORY", "USAGE", keys->element[i]->str}; size_t lens[] = {6, 5, keys->element[i]->len}; redisAppendCommandArgv(context, 3, argv, lens); @@ -9035,7 +9189,27 @@ static void longStatLoopModeStop(int s) { force_cancel_loop = 1; } -static void findBigKeys(int memkeys, unsigned memkeys_samples) { +/* In cluster mode we may need to send the READONLY command. + Ignore the error in case the server isn't using cluster mode. 
*/ +static void sendReadOnly(void) { + redisReply *read_reply; + read_reply = redisCommand(context, "READONLY"); + if (read_reply == NULL){ + fprintf(stderr, "\nI/O error\n"); + exit(1); + } else if (read_reply->type == REDIS_REPLY_ERROR && + strcmp(read_reply->str, "ERR This instance has cluster support disabled") != 0 && + strncmp(read_reply->str, "ERR unknown command", 19) != 0) { + fprintf(stderr, "Error: %s\n", read_reply->str); + exit(1); + } + freeReplyObject(read_reply); +} + +static int displayKeyStatsProgressbar(unsigned long long sampled, + unsigned long long total_keys); + +static void findBigKeys(int memkeys, long long memkeys_samples) { unsigned long long sampled = 0, total_keys, totlen=0, *sizes=NULL, it=0, scan_loops = 0; redisReply *reply, *keys; unsigned int arrsize=0, i; @@ -9043,6 +9217,7 @@ static void findBigKeys(int memkeys, unsigned memkeys_samples) { dictEntry *de; typeinfo **types = NULL; double pct; + long long refresh_time = mstime(); dict *types_dict = dictCreate(&typeinfoDictType); typeinfo_add(types_dict, "string", &type_string); @@ -9060,6 +9235,9 @@ static void findBigKeys(int memkeys, unsigned memkeys_samples) { printf("\n# Scanning the entire keyspace to find biggest keys as well as\n"); printf("# average sizes per key type. You can use -i 0.1 to sleep 0.1 sec\n"); printf("# per 100 SCAN commands (not usually needed).\n\n"); + + /* Use readonly in cluster */ + sendReadOnly(); /* SCAN loop */ do { @@ -9110,19 +9288,44 @@ static void findBigKeys(int memkeys, unsigned memkeys_samples) { exit(1); } - printf( - "[%05.2f%%] Biggest %-6s found so far '%s' with %llu %s\n", - pct, type->name, type->biggest_key, sizes[i], - !memkeys? type->sizeunit: "bytes"); + /* We only show the original progress output when writing to a file */ + if (!IS_TTY_OR_FAKETTY()) { + printf("[%05.2f%%] Biggest %-6s found so far %s with %llu %s\n", + pct, type->name, type->biggest_key, sizes[i], + !memkeys? 
type->sizeunit: "bytes"); + } /* Keep track of the biggest size for this type */ type->biggest = sizes[i]; } - /* Update overall progress */ - if(sampled % 1000000 == 0) { + /* Update overall progress + * We only show the original progress output when writing to a file */ + if (sampled % 1000000 == 0 && !IS_TTY_OR_FAKETTY()) { printf("[%05.2f%%] Sampled %llu keys so far\n", pct, sampled); } + + /* Show the progress bar in TTY */ + if (mstime() > refresh_time + REFRESH_INTERVAL && IS_TTY_OR_FAKETTY()) { + int line_count = 0; + refresh_time = mstime(); + + line_count = displayKeyStatsProgressbar(sampled, total_keys); + line_count += cleanPrintfln(""); + + di = dictGetIterator(types_dict); + while ((de = dictNext(di))) { + typeinfo *current_type = dictGetVal(de); + if (current_type->biggest > 0) { + line_count += cleanPrintfln("Biggest %-9s found so far %s with %llu %s", + current_type->name, current_type->biggest_key, current_type->biggest, + !memkeys? current_type->sizeunit: "bytes"); + } + } + dictReleaseIterator(di); + + printf("\033[%dA\r", line_count); + } } /* Sleep if we've been directed to do so */ @@ -9133,13 +9336,31 @@ static void findBigKeys(int memkeys, unsigned memkeys_samples) { freeReplyObject(reply); } while(force_cancel_loop == 0 && it != 0); + /* Final progress bar if TTY */ + if (IS_TTY_OR_FAKETTY()) { + displayKeyStatsProgressbar(sampled, total_keys); + + /* Clean the types info shown during the progress bar */ + int line_count = 0; + di = dictGetIterator(types_dict); + while ((de = dictNext(di))) + line_count += cleanPrintfln(""); + dictReleaseIterator(di); + printf("\033[%dA\r", line_count); + } + if(types) zfree(types); if(sizes) zfree(sizes); /* We're done */ printf("\n-------- summary -------\n\n"); - if (force_cancel_loop) printf("[%05.2f%%] ", pct); - printf("Sampled %llu keys in the keyspace!\n", sampled); + + /* Show percentage and sampled output when writing to a file */ + if (!IS_TTY_OR_FAKETTY()) { + if (force_cancel_loop) 
printf("[%05.2f%%] ", pct); + printf("Sampled %llu keys in the keyspace!\n", sampled); + } + printf("Total key length in bytes is %llu (avg len %.2f)\n\n", totlen, totlen ? (double)totlen/sampled : 0); @@ -9148,7 +9369,7 @@ static void findBigKeys(int memkeys, unsigned memkeys_samples) { while ((de = dictNext(di))) { typeinfo *type = dictGetVal(de); if(type->biggest_key) { - printf("Biggest %6s found '%s' has %llu %s\n", type->name, type->biggest_key, + printf("Biggest %6s found %s has %llu %s\n", type->name, type->biggest_key, type->biggest, !memkeys? type->sizeunit: "bytes"); } } @@ -9214,8 +9435,10 @@ static void findHotKeys(void) { unsigned long long counters[HOTKEYS_SAMPLE] = {0}; sds hotkeys[HOTKEYS_SAMPLE] = {NULL}; unsigned long long sampled = 0, total_keys, *freqs = NULL, it = 0, scan_loops = 0; - unsigned int arrsize = 0, i, k; + unsigned int arrsize = 0, i; + int k; double pct; + long long refresh_time = mstime(); signal(SIGINT, longStatLoopModeStop); /* Total keys pre scanning */ @@ -9226,6 +9449,9 @@ static void findHotKeys(void) { printf("# average sizes per key type. You can use -i 0.1 to sleep 0.1 sec\n"); printf("# per 100 SCAN commands (not usually needed).\n\n"); + /* Use readonly in cluster */ + sendReadOnly(); + /* SCAN loop */ do { /* Calculate approximate percentage completion */ @@ -9253,8 +9479,10 @@ static void findHotKeys(void) { /* Now update our stats */ for(i=0;ielements;i++) { sampled++; - /* Update overall progress */ - if(sampled % 1000000 == 0) { + + /* Update overall progress. 
+ * Only show the original progress output when writing to a file */ + if (sampled % 1000000 == 0 && !IS_TTY_OR_FAKETTY()) { printf("[%05.2f%%] Sampled %llu keys so far\n", pct, sampled); } @@ -9272,9 +9500,30 @@ static void findHotKeys(void) { } counters[k] = freqs[i]; hotkeys[k] = sdscatrepr(sdsempty(), keys->element[i]->str, keys->element[i]->len); - printf( - "[%05.2f%%] Hot key '%s' found so far with counter %llu\n", - pct, hotkeys[k], freqs[i]); + + /* Only show the original progress output when writing to a file */ + if (!IS_TTY_OR_FAKETTY()) { + printf("[%05.2f%%] Hot key %s found so far with counter %llu\n", + pct, hotkeys[k], freqs[i]); + } + } + + /* Show the progress bar in TTY */ + if (mstime() > refresh_time + REFRESH_INTERVAL && IS_TTY_OR_FAKETTY()) { + int line_count = 0; + refresh_time = mstime(); + + line_count = displayKeyStatsProgressbar(sampled, total_keys); + line_count += cleanPrintfln(""); + + for (k = HOTKEYS_SAMPLE - 1; k >= 0; k--) { + if (counters[k] > 0) { + line_count += cleanPrintfln("hot key found with counter: %llu\tkeyname: %s", + counters[k], hotkeys[k]); + } + } + + printf("\033[%dA\r", line_count); } /* Sleep if we've been directed to do so */ @@ -9285,16 +9534,30 @@ static void findHotKeys(void) { freeReplyObject(reply); } while(force_cancel_loop ==0 && it != 0); + /* Final progress bar in TTY */ + if (IS_TTY_OR_FAKETTY()) { + displayKeyStatsProgressbar(sampled, total_keys); + + /* clean the types info shown during the progress bar */ + int line_count = 0; + for (k = 0; k <= HOTKEYS_SAMPLE; k++) + line_count += cleanPrintfln(""); + printf("\033[%dA\r", line_count); + } + if (freqs) zfree(freqs); /* We're done */ printf("\n-------- summary -------\n\n"); - if(force_cancel_loop)printf("[%05.2f%%] ",pct); - printf("Sampled %llu keys in the keyspace!\n", sampled); - for (i=1; i<= HOTKEYS_SAMPLE; i++) { - k = HOTKEYS_SAMPLE - i; - if(counters[k]>0) { + /* Show the original output when writing to a file */ + if (!IS_TTY_OR_FAKETTY()) 
{ + if(force_cancel_loop) printf("[%05.2f%%] ",pct); + printf("Sampled %llu keys in the keyspace!\n", sampled); + } + + for (k = HOTKEYS_SAMPLE - 1; k >= 0; k--) { + if (counters[k] > 0) { printf("hot key found with counter: %llu\tkeyname: %s\n", counters[k], hotkeys[k]); sdsfree(hotkeys[k]); } @@ -9339,9 +9602,11 @@ static long getLongInfoField(char *info, char *field) { } /* Convert number of bytes into a human readable string of the form: - * 100B, 2G, 100M, 4K, and so forth. */ -void bytesToHuman(char *s, size_t size, long long n) { + * 1003B, 4.03K, 100.00M, 2.32G, 3.01T + * Returns the parameter `s` containing the converted number. */ +char *bytesToHuman(char *s, size_t size, long long n) { double d; + char *r = s; if (n < 0) { *s = '-'; @@ -9351,7 +9616,6 @@ void bytesToHuman(char *s, size_t size, long long n) { if (n < 1024) { /* Bytes */ snprintf(s,size,"%lldB",n); - return; } else if (n < (1024*1024)) { d = (double)n/(1024); snprintf(s,size,"%.2fK",d); @@ -9361,12 +9625,18 @@ void bytesToHuman(char *s, size_t size, long long n) { } else if (n < (1024LL*1024*1024*1024)) { d = (double)n/(1024LL*1024*1024); snprintf(s,size,"%.2fG",d); + } else if (n < (1024LL*1024*1024*1024*1024)) { + d = (double)n/(1024LL*1024*1024*1024); + snprintf(s,size,"%.2fT",d); } + + return r; } static void statMode(void) { redisReply *reply; long aux, requests = 0; + int dbnum = getDatabases(); int i = 0; while(1) { @@ -9390,7 +9660,7 @@ static void statMode(void) { /* Keys */ aux = 0; - for (j = 0; j < 20; j++) { + for (j = 0; j < dbnum; j++) { long k; snprintf(buf,sizeof(buf),"db%d:keys",j); @@ -9742,6 +10012,559 @@ void testHintSuite(char *filename) { exit(fail); } +/*------------------------------------------------------------------------------ + * Keystats + *--------------------------------------------------------------------------- */ + +/* Key name length distribution. */ + +typedef struct size_dist_entry { + unsigned long long size; /* Key name size in bytes. 
*/ + unsigned long long count; /* Number of key names that are less or equal to the size. */ +} size_dist_entry; + +typedef struct size_dist { + unsigned long long total_count; /* Total number of key names in the distribution. */ + unsigned long long total_size; /* Sum of all the key name sizes in bytes. */ + unsigned long long max_size; /* Highest key name size in bytes. */ + size_dist_entry *size_dist; /* Array of sizes and key names count per size. */ +} size_dist; + +/* distribution is an array initialized with last element {0, 0} + * for instance: size_dist_entry distribution[] = { {32, 0}, {256, 0}, {0, 0} }; */ +static void sizeDistInit(size_dist *dist, size_dist_entry *distribution) { + dist->max_size = 0; + dist->total_count = 0; + dist->total_size = 0; + dist->size_dist = distribution; +} + +static void addSizeDist(size_dist *dist, unsigned long long size) { + dist->total_count++; + dist->total_size += size; + + if (size > dist->max_size) + dist->max_size = size; + + int j; + for (j=0; dist->size_dist[j].size && size > dist->size_dist[j].size; j++); + dist->size_dist[j].count++; +} + +static int displayKeyStatsLengthDist(size_dist *dist) { + int line_count = 0; + unsigned long long total_keys = 0, size; + char buf[2][256]; + + line_count += cleanPrintfln("Key name length Percentile Total keys"); + line_count += cleanPrintfln("--------------- ---------- -----------"); + + for (int i=0; dist->size_dist[i].size; i++) { + if (dist->size_dist[i].count) { + if (dist->max_size < dist->size_dist[i].size) { + size = dist->max_size; + } else { + size = dist->size_dist[i].size; + } + total_keys += dist->size_dist[i].count; + line_count += cleanPrintfln("%15s %9.4f%% %11llu", + bytesToHuman(buf[1], sizeof(buf[1]), size), + (double)100 * total_keys / dist->total_count, + total_keys); + } + } + + if (total_keys < dist->total_count) { + line_count += cleanPrintfln(" inf %9.4f%% %11llu", 100.0, dist->total_count); + } + + line_count += cleanPrintfln("Total key length is 
%s (%s avg)", + bytesToHuman(buf[0], sizeof(buf[0]), dist->total_size), + dist->total_count ? bytesToHuman(buf[1], sizeof(buf[1]), dist->total_size/dist->total_count) : "0"); + + return line_count; +} + +#define PROGRESSBAR_WIDTH 60 +static int displayKeyStatsProgressbar(unsigned long long sampled, + unsigned long long total_keys) +{ + int line_count = 0; + char progressbar[512]; + char buf[2][128]; + + /* We can go over 100% if keys are added in the middle of the scans. + * Cap at 100% or the progressbar memset will overflow. */ + double completion_pct = total_keys ? sampled < total_keys ? (double) sampled/total_keys : 1 : 0; + + /* If we are not redirecting to a file, build the progress bar */ + if (IS_TTY_OR_FAKETTY()) { + int completed_width = (int)round(PROGRESSBAR_WIDTH * completion_pct); + memset(buf[0], '|', completed_width); + buf[0][completed_width]= '\0'; + + int uncompleted_width = PROGRESSBAR_WIDTH - completed_width; + memset(buf[1], '-', uncompleted_width); + buf[1][uncompleted_width]= '\0'; + + char red[] = "\033[31m"; + char green[] = "\033[32m"; + char default_color[] = "\033[39m"; + snprintf(progressbar, sizeof(progressbar), "%s%s%s%s%s", + green, buf[0], red, buf[1], default_color); + } else { + snprintf(progressbar, sizeof(progressbar), "%s", "keys scanned"); + } + + line_count += cleanPrintfln("%6.2f%% %s", completion_pct * 100, progressbar); + line_count += cleanPrintfln("Keys sampled: %llu", sampled); + + return line_count; +} + +static int displayKeyStatsSizeType(dict *memkeys_types_dict) { + dictIterator *di; + dictEntry *de; + int line_count = 0; + char buf[256]; + + line_count += cleanPrintfln("--- Top size per type ---"); + di = dictGetIterator(memkeys_types_dict); + while ((de = dictNext(di))) { + typeinfo *type = dictGetVal(de); + if (type->biggest_key) { + line_count += cleanPrintfln("%-10s %s is %s", + type->name, type->biggest_key, + bytesToHuman(buf, sizeof(buf),type->biggest)); + } + } + dictReleaseIterator(di); + + return 
line_count; +} + +static int displayKeyStatsLengthType(dict *bigkeys_types_dict) { + dictIterator *di; + dictEntry *de; + int line_count = 0; + char buf[256]; + + line_count += cleanPrintfln("--- Top length and cardinality per type ---"); + di = dictGetIterator(bigkeys_types_dict); + while ((de = dictNext(di))) { + typeinfo *type = dictGetVal(de); + if (type->biggest_key) { + if (!strcmp(type->sizeunit, "bytes")) { + bytesToHuman(buf, sizeof(buf), type->biggest); + } else { + snprintf(buf, sizeof(buf), "%llu %s", type->biggest, type->sizeunit); + } + line_count += cleanPrintfln("%-10s %s has %s", type->name, type->biggest_key, buf); + } + } + dictReleaseIterator(di); + + return line_count; +} + +static int displayKeyStatsSizeDist(struct hdr_histogram *keysize_histogram) { + int line_count = 0; + double percentile; + char size[32], mean[32], stddev[32]; + struct hdr_iter iter; + int64_t last_displayed_cumulative_count = 0; + + hdr_iter_percentile_init(&iter, keysize_histogram, 1); + + line_count += cleanPrintfln("Key size Percentile Total keys"); + line_count += cleanPrintfln("-------- ---------- -----------"); + + while (hdr_iter_next(&iter)) { + /* Skip repeat in hdr_histogram cumulative_count, and set the last line + * to 100% when total_count is reached. 
For instance: + * 140.68K 99.9969% 50013 + * 140.68K 99.9977% 50013 + * 2.04G 99.9985% 50014 + * 2.04G 100.0000% 50014 + * Will display: + * 140.68K 99.9969% 50013 + * 2.04G 100.0000% 50014 */ + + if (iter.cumulative_count != last_displayed_cumulative_count) { + if (iter.cumulative_count == iter.h->total_count) { + percentile = 100; + } else { + percentile = iter.specifics.percentiles.percentile; + } + + line_count += cleanPrintfln("%8s %9.4f%% %11lld", + bytesToHuman(size, sizeof(size), iter.highest_equivalent_value), + percentile, + iter.cumulative_count); + + last_displayed_cumulative_count = iter.cumulative_count; + } + } + + bytesToHuman(mean, sizeof(mean),hdr_mean(keysize_histogram)); + bytesToHuman(stddev, sizeof(stddev),hdr_stddev(keysize_histogram)); + line_count += cleanPrintfln("Note: 0.01%% size precision, Mean: %s, StdDeviation: %s", mean, stddev); + + return line_count; +} + +static int displayKeyStatsType(unsigned long long sampled, + dict *memkeys_types_dict, + dict *bigkeys_types_dict) +{ + dictIterator *di; + dictEntry *de; + int line_count = 0; + char total_size[64], size_avg[64], total_length[64], length_avg[64]; + + line_count += cleanPrintfln("Type Total keys Keys %% Tot size Avg size Total length/card Avg ln/card"); + line_count += cleanPrintfln("--------- ------------ ------- -------- -------- ------------------ -----------"); + + di = dictGetIterator(memkeys_types_dict); + while ((de = dictNext(di))) { + typeinfo *memkey_type = dictGetVal(de); + if (memkey_type->count) { + /* Key count, percentage, memkeys info */ + bytesToHuman(total_size, sizeof(total_size), memkey_type->totalsize); + bytesToHuman(size_avg, sizeof(size_avg), memkey_type->totalsize/memkey_type->count); + + strncpy(total_length, " - ", sizeof(total_length)); + strncpy(length_avg, " - ", sizeof(length_avg)); + + /* bigkeys info */ + dictEntry *bk_de = dictFind(bigkeys_types_dict, memkey_type->name); + if (bk_de) { /* If we have it in memkeys it should be in bigkeys */ + 
typeinfo *bigkey_type = dictGetVal(bk_de); + if (bigkey_type->sizecmd && bigkey_type->count) { + double avg = (double)bigkey_type->totalsize/bigkey_type->count; + if (!strcmp(bigkey_type->sizeunit, "bytes")) { + bytesToHuman(total_length, sizeof(total_length), bigkey_type->totalsize); + bytesToHuman(length_avg, sizeof(length_avg), (long long)round(avg)); /* better than truncating */ + } else { + snprintf(total_length, sizeof(total_length), "%llu %s", bigkey_type->totalsize, bigkey_type->sizeunit); + snprintf(length_avg, sizeof(length_avg), "%.2f", avg); + } + } + } + /* Print the line for the given Redis type */ + line_count += cleanPrintfln("%-10s %11llu %6.2f%% %8s %8s %18s %11s", + memkey_type->name, memkey_type->count, + sampled ? 100 * (double)memkey_type->count/sampled : 0, + total_size, size_avg, total_length, length_avg); + } + } + dictReleaseIterator(di); + + return line_count; +} + +typedef struct key_info { + unsigned long long size; + char type_name[10]; /* Key type name seems to be 9 char max + \0 */ + sds key_name; +} key_info; + +static int displayKeyStatsTopSizes(list *top_key_sizes, unsigned long top_sizes_limit) { + int line_count = 0, i = 0; + + line_count += cleanPrintfln("--- Top %llu key sizes ---", top_sizes_limit); + char buffer[32]; + listIter *iter = listGetIterator(top_key_sizes, AL_START_HEAD); + listNode *node; + while ((node = listNext(iter)) != NULL) { + key_info *key = (key_info*) listNodeValue(node); + line_count += cleanPrintfln("%3d %8s %-10s %s", ++i, bytesToHuman(buffer, sizeof(buffer), key->size), + key->type_name, key->key_name); + } + listReleaseIterator(iter); + + return line_count; +} + +static key_info *createKeySizeInfo(char *key_name, size_t key_name_len, char *key_type, unsigned long long size) { + key_info *key = zmalloc(sizeof(key_info)); + key->size = size; + snprintf(key->type_name, sizeof(key->type_name), "%s", key_type); + key->key_name = sdscatrepr(sdsempty(), key_name, key_name_len); + if (!key->key_name) { + 
fprintf(stderr, "Failed to allocate memory for key name.\n"); + exit(1); + } + return key; +} + +/* Insert key info in topkeys sorted by size (from high to low size). + * Keep a maximum of config.top_sizes_limit items in topkeys list. + * key_name and type_name are copied. + * Return: 0 size was not added (too small), 1 size was inserted. */ +static int updateTopSizes(char *key_name, size_t key_name_len, unsigned long long key_size, + char *type_name, list *topkeys, unsigned long top_sizes_limit) +{ + listNode *node; + listIter *iter; + key_info *new_node; + + /* Check if we do not need to add to the list */ + if (top_sizes_limit != 0 && + topkeys->len == top_sizes_limit && + key_size <= ((key_info*)topkeys->tail->value)->size){ + return 0; + } + + /* Find where to insert the new key size */ + iter = listGetIterator(topkeys, AL_START_HEAD); + do { + node = listNext(iter); + } while (node != NULL && key_size <= ((key_info*)node->value)->size); + listReleaseIterator(iter); + + new_node = createKeySizeInfo(key_name, key_name_len, type_name, key_size); + if (node) { + /* Insert before the node */ + listInsertNode(topkeys, node, new_node, 0); + } else { + /* Insert as the last node */ + listAddNodeTail(topkeys, new_node); + } + + /* Trim to stay within the limit */ + if (topkeys->len == top_sizes_limit + 1) { + sdsfree(((key_info*)topkeys->tail->value)->key_name); + listDelNode(topkeys, topkeys->tail); /* list->free is set */ + } + + return 1; +} + +static void displayKeyStats(unsigned long long sampled, unsigned long long total_keys, + unsigned long long total_size, dict *memkeys_types_dict, + dict *bigkeys_types_dict, list *top_key_sizes, + unsigned long top_sizes_limit, int move_cursor_up) +{ + int line_count = 0; + char buf[256]; + + line_count += displayKeyStatsProgressbar(sampled, total_keys); + line_count += cleanPrintfln("Keys size: %s", bytesToHuman(buf, sizeof(buf), total_size)); + line_count += cleanPrintfln(""); + line_count += 
displayKeyStatsTopSizes(top_key_sizes, top_sizes_limit); + line_count += cleanPrintfln(""); + line_count += displayKeyStatsSizeType(memkeys_types_dict); + line_count += cleanPrintfln(""); + line_count += displayKeyStatsLengthType(bigkeys_types_dict); + + /* If we need to refresh the stats */ + if (move_cursor_up) { + printf("\033[%dA\r", line_count); + } + + fflush(stdout); +} + +static void updateKeyType(redisReply *element, unsigned long long size, typeinfo *type) { + type->totalsize += size; + type->count++; + + if (size > type->biggest) { + /* Keep track of the biggest key name for this type */ + if (type->biggest_key) + sdsfree(type->biggest_key); + type->biggest_key = sdsnewlen(element->str, element->len); + if (!type->biggest_key) { + fprintf(stderr, "Failed to allocate memory for key!\n"); + exit(1); + } + /* Keep track of the biggest size for this type */ + type->biggest = size; + } +} + +static void keyStats(long long memkeys_samples, unsigned long long cursor, unsigned long top_sizes_limit) { + unsigned long long sampled = 0, total_keys, total_size = 0, it = 0, scan_loops = 0; + unsigned long long *memkeys_sizes = NULL, *bigkeys_sizes = NULL; + redisReply *reply, *keys; + unsigned int array_size = 0, i; + typeinfo **memkeys_types = NULL, **bigkeys_types = NULL; + list *top_sizes; + long long refresh_time = mstime(); + + if (cursor != 0) { + it = cursor; + } + + if ((top_sizes = listCreate()) == NULL) { + fprintf(stderr, "top_sizes list creation failed.\n"); + exit(1); + } + top_sizes->free = zfree; + + dict *memkeys_types_dict = dictCreate(&typeinfoDictType); + typeinfo_add(memkeys_types_dict, "string", &type_string); + typeinfo_add(memkeys_types_dict, "list", &type_list); + typeinfo_add(memkeys_types_dict, "set", &type_set); + typeinfo_add(memkeys_types_dict, "hash", &type_hash); + typeinfo_add(memkeys_types_dict, "zset", &type_zset); + typeinfo_add(memkeys_types_dict, "stream", &type_stream); + + /* We could use only one typeinfo dictionary if we add new fields to save + * both memkey and bigkey info. 
Not sure it would make sense in findBigKeys(). */ + dict *bigkeys_types_dict = dictCreate(&typeinfoDictType); + typeinfo_add(bigkeys_types_dict, "string", &type_string); + typeinfo_add(bigkeys_types_dict, "list", &type_list); + typeinfo_add(bigkeys_types_dict, "set", &type_set); + typeinfo_add(bigkeys_types_dict, "hash", &type_hash); + typeinfo_add(bigkeys_types_dict, "zset", &type_zset); + typeinfo_add(bigkeys_types_dict, "stream", &type_stream); + + size_dist key_length_dist; + size_dist_entry distribution[] = { + {1<<5, 0}, /* 32 B (sds) */ + {1<<8, 0}, /* 256 B (sds) */ + {1<<16, 0}, /* 64 KB (sds and Redis Enterprise key name max length) */ + {1024*1024, 0}, /* 1 MB */ + {16*1024*1024, 0}, /* 16 MB */ + {128*1024*1024, 0}, /* 128 MB */ + {512*1024*1024, 0}, /* 512 MB (max String size) */ + {0, 0}, /* Sizes above the last entry */ + }; + sizeDistInit(&key_length_dist, distribution); + + struct hdr_histogram *keysize_histogram; + /* Record max of 1TB for a key size should cover all keys. 
+ * significant_figures == 4 (0.01% precision on key size) */ + if (hdr_init(1, 1ULL*1024*1024*1024*1024, 4, &keysize_histogram)) { + fprintf(stderr, "Keystats hdr init error\n"); + exit(1); + } + + signal(SIGINT, longStatLoopModeStop); + + /* Total keys pre scanning */ + total_keys = getDbSize(); + + /* Status message */ + printf("\n# Scanning the entire keyspace to find the biggest keys and distribution information.\n"); + printf("# Use -i 0.1 to sleep 0.1 sec per 100 SCAN commands (not usually needed).\n"); + printf("# Use --cursor to start the scan at the cursor (usually after a Ctrl-C).\n"); + printf("# Use --top to display top key sizes (default is 10).\n"); + printf("# Ctrl-C to stop the scan.\n\n"); + + /* Use readonly in cluster */ + sendReadOnly(); + + /* SCAN loop */ + do { + /* Grab some keys and point to the keys array */ + reply = sendScan(&it); + scan_loops++; + keys = reply->element[1]; + + /* Reallocate our type and size array if we need to */ + if (keys->elements > array_size) { + memkeys_types = zrealloc(memkeys_types, sizeof(typeinfo*)*keys->elements); + memkeys_sizes = zrealloc(memkeys_sizes, sizeof(unsigned long long)*keys->elements); + + bigkeys_types = zrealloc(bigkeys_types, sizeof(typeinfo*)*keys->elements); + bigkeys_sizes = zrealloc(bigkeys_sizes, sizeof(unsigned long long)*keys->elements); + + if (!memkeys_types || !memkeys_sizes || !bigkeys_types || !bigkeys_sizes) { + fprintf(stderr, "Failed to allocate storage for keys!\n"); + exit(1); + } + + array_size = keys->elements; + } + + /* Retrieve types and sizes for memkeys */ + getKeyTypes(memkeys_types_dict, keys, memkeys_types); + getKeySizes(keys, memkeys_types, memkeys_sizes, 1, memkeys_samples); + + /* Retrieve types and sizes for bigkeys */ + getKeyTypes(bigkeys_types_dict, keys, bigkeys_types); + getKeySizes(keys, bigkeys_types, bigkeys_sizes, 0, memkeys_samples); + + for (i=0; i<keys->elements; i++) { + /* Skip keys that disappeared between SCAN and TYPE */ + if (!memkeys_types[i] || 
!bigkeys_types[i]) { + continue; + } + + total_size += memkeys_sizes[i]; + sampled++; + + updateTopSizes(keys->element[i]->str, keys->element[i]->len, memkeys_sizes[i], + memkeys_types[i]->name, top_sizes, top_sizes_limit); + updateKeyType(keys->element[i], memkeys_sizes[i], memkeys_types[i]); + updateKeyType(keys->element[i], bigkeys_sizes[i], bigkeys_types[i]); + + /* Key Size distribution */ + if (hdr_record_value(keysize_histogram, memkeys_sizes[i]) == 0) { + fprintf(stderr, "Value %llu not added in the hdr histogram.\n", memkeys_sizes[i]); + } + + /* Key length distribution */ + addSizeDist(&key_length_dist, keys->element[i]->len); + } + + /* Refresh keystats info on regular basis */ + if (mstime() > refresh_time + REFRESH_INTERVAL && IS_TTY_OR_FAKETTY()) { + displayKeyStats(sampled, total_keys, total_size, memkeys_types_dict, bigkeys_types_dict, + top_sizes, top_sizes_limit, 1); + refresh_time = mstime(); + } + + /* Sleep if we've been directed to do so */ + if (config.interval && (scan_loops % 100) == 0) { + usleep(config.interval); + } + + freeReplyObject(reply); + } while(force_cancel_loop == 0 && it != 0); + + displayKeyStats(sampled, total_keys, total_size, memkeys_types_dict, bigkeys_types_dict, top_sizes, + top_sizes_limit, 0); + + /* Additional data at the end of the SCAN loop. + * Using cleanPrintfln in case we want to print during the SCAN loop. 
*/ + cleanPrintfln(""); + displayKeyStatsSizeDist(keysize_histogram); + cleanPrintfln(""); + displayKeyStatsLengthDist(&key_length_dist); + cleanPrintfln(""); + displayKeyStatsType(sampled, memkeys_types_dict, bigkeys_types_dict); + + if (it != 0) { + printf("\n"); + printf("Scan interrupted:\n"); + printf("Use 'redis-cli --keystats --cursor %llu' to restart from the last cursor.\n", it); + } + + if (memkeys_types) zfree(memkeys_types); + if (bigkeys_types) zfree(bigkeys_types); + if (memkeys_sizes) zfree(memkeys_sizes); + if (bigkeys_sizes) zfree(bigkeys_sizes); + dictRelease(memkeys_types_dict); + dictRelease(bigkeys_types_dict); + hdr_close(keysize_histogram); + + /* sdsfree before listRelease */ + listIter *iter = listGetIterator(top_sizes, AL_START_HEAD); + listNode *node; + while ((node = listNext(iter)) != NULL) { + key_info *key = (key_info*) listNodeValue(node); + sdsfree(key->key_name); + } + listReleaseIterator(iter); + listRelease(top_sizes); /* list->free is set */ + + exit(0); +} + /*------------------------------------------------------------------------------ * Program main() *--------------------------------------------------------------------------- */ @@ -9753,6 +10576,8 @@ int main(int argc, char **argv) { memset(&config.sslconfig, 0, sizeof(config.sslconfig)); config.conn_info.hostip = sdsnew("127.0.0.1"); config.conn_info.hostport = 6379; + config.connect_timeout.tv_sec = 0; + config.connect_timeout.tv_usec = 0; config.hostsocket = NULL; config.repeat = 1; config.interval = 0; @@ -9782,6 +10607,10 @@ int main(int argc, char **argv) { config.pipe_mode = 0; config.pipe_timeout = REDIS_CLI_DEFAULT_PIPE_TIMEOUT; config.bigkeys = 0; + config.memkeys = 0; + config.keystats = 0; + config.cursor = 0; + config.top_sizes_limit = 10; config.hotkeys = 0; config.stdin_lastarg = 0; config.stdin_tag_arg = 0; @@ -9801,6 +10630,8 @@ int main(int argc, char **argv) { config.no_auth_warning = 0; config.in_multi = 0; config.server_version = NULL; + 
config.prefer_ipv4 = 0; + config.prefer_ipv6 = 0; config.cluster_manager_command.name = NULL; config.cluster_manager_command.argc = 0; config.cluster_manager_command.argv = NULL; @@ -9918,6 +10749,12 @@ int main(int argc, char **argv) { findBigKeys(1, config.memkeys_samples); } + /* Find big and large keys */ + if (config.keystats) { + if (cliConnect(0) == REDIS_ERR) exit(1); + keyStats(config.memkeys_samples, config.cursor, config.top_sizes_limit); + } + /* Find hot keys */ if (config.hotkeys) { if (cliConnect(0) == REDIS_ERR) exit(1); diff --git a/src/redisassert.c b/src/redisassert.c index 9f7402e7482..fb16bd6a2fe 100644 --- a/src/redisassert.c +++ b/src/redisassert.c @@ -6,7 +6,7 @@ * * ---------------------------------------------------------------------------- * - * Copyright (c) 2021, Andy Pan and Redis Labs + * Copyright (c) 2021, Andy Pan and Redis Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/redisassert.h b/src/redisassert.h index a3f95da091d..a85cac6df7d 100644 --- a/src/redisassert.h +++ b/src/redisassert.h @@ -7,32 +7,11 @@ * * ---------------------------------------------------------------------------- * - * Copyright (c) 2006-2012, Salvatore Sanfilippo + * Copyright (c) 2006-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #ifndef __REDIS_ASSERT_H__ diff --git a/src/redismodule.h b/src/redismodule.h index 4378126e2b0..8b5d2beb65d 100644 --- a/src/redismodule.h +++ b/src/redismodule.h @@ -959,8 +959,10 @@ typedef struct RedisModuleTypeMethods { REDISMODULE_API void * (*RedisModule_Alloc)(size_t bytes) REDISMODULE_ATTR; REDISMODULE_API void * (*RedisModule_TryAlloc)(size_t bytes) REDISMODULE_ATTR; REDISMODULE_API void * (*RedisModule_Realloc)(void *ptr, size_t bytes) REDISMODULE_ATTR; +REDISMODULE_API void * (*RedisModule_TryRealloc)(void *ptr, size_t bytes) REDISMODULE_ATTR; REDISMODULE_API void (*RedisModule_Free)(void *ptr) REDISMODULE_ATTR; REDISMODULE_API void * (*RedisModule_Calloc)(size_t nmemb, size_t size) REDISMODULE_ATTR; +REDISMODULE_API void * (*RedisModule_TryCalloc)(size_t nmemb, size_t size) REDISMODULE_ATTR; REDISMODULE_API char * (*RedisModule_Strdup)(const char *str) REDISMODULE_ATTR; REDISMODULE_API int (*RedisModule_GetApi)(const char *, void *) REDISMODULE_ATTR; REDISMODULE_API int (*RedisModule_CreateCommand)(RedisModuleCtx *ctx, const char *name, RedisModuleCmdFunc cmdfunc, const char *strflags, int firstkey, int lastkey, int keystep) REDISMODULE_ATTR; @@ -968,6 +970,7 @@ REDISMODULE_API RedisModuleCommand *(*RedisModule_GetCommand)(RedisModuleCtx *ct REDISMODULE_API int (*RedisModule_CreateSubcommand)(RedisModuleCommand *parent, const char *name, RedisModuleCmdFunc cmdfunc, const char *strflags, int firstkey, int lastkey, int keystep) REDISMODULE_ATTR; REDISMODULE_API int (*RedisModule_SetCommandInfo)(RedisModuleCommand *command, const RedisModuleCommandInfo *info) REDISMODULE_ATTR; REDISMODULE_API int (*RedisModule_SetCommandACLCategories)(RedisModuleCommand *command, const char *ctgrsflags) REDISMODULE_ATTR; +REDISMODULE_API int (*RedisModule_AddACLCategory)(RedisModuleCtx *ctx, const char *name) REDISMODULE_ATTR; REDISMODULE_API void (*RedisModule_SetModuleAttribs)(RedisModuleCtx *ctx, const char *name, int ver, int apiver) REDISMODULE_ATTR; 
REDISMODULE_API int (*RedisModule_IsModuleNameBusy)(const char *name) REDISMODULE_ATTR; REDISMODULE_API int (*RedisModule_WrongArity)(RedisModuleCtx *ctx) REDISMODULE_ATTR; @@ -1250,6 +1253,8 @@ REDISMODULE_API void (*RedisModule_GetRandomBytes)(unsigned char *dst, size_t le REDISMODULE_API void (*RedisModule_GetRandomHexChars)(char *dst, size_t len) REDISMODULE_ATTR; REDISMODULE_API void (*RedisModule_SetDisconnectCallback)(RedisModuleBlockedClient *bc, RedisModuleDisconnectFunc callback) REDISMODULE_ATTR; REDISMODULE_API void (*RedisModule_SetClusterFlags)(RedisModuleCtx *ctx, uint64_t flags) REDISMODULE_ATTR; +REDISMODULE_API unsigned int (*RedisModule_ClusterKeySlot)(RedisModuleString *key) REDISMODULE_ATTR; +REDISMODULE_API const char *(*RedisModule_ClusterCanonicalKeyNameInSlot)(unsigned int slot) REDISMODULE_ATTR; REDISMODULE_API int (*RedisModule_ExportSharedAPI)(RedisModuleCtx *ctx, const char *apiname, void *func) REDISMODULE_ATTR; REDISMODULE_API void * (*RedisModule_GetSharedAPI)(RedisModuleCtx *ctx, const char *apiname) REDISMODULE_ATTR; REDISMODULE_API RedisModuleCommandFilter * (*RedisModule_RegisterCommandFilter)(RedisModuleCtx *ctx, RedisModuleCommandFilterFunc cb, int flags) REDISMODULE_ATTR; @@ -1321,14 +1326,17 @@ static int RedisModule_Init(RedisModuleCtx *ctx, const char *name, int ver, int REDISMODULE_GET_API(Alloc); REDISMODULE_GET_API(TryAlloc); REDISMODULE_GET_API(Calloc); + REDISMODULE_GET_API(TryCalloc); REDISMODULE_GET_API(Free); REDISMODULE_GET_API(Realloc); + REDISMODULE_GET_API(TryRealloc); REDISMODULE_GET_API(Strdup); REDISMODULE_GET_API(CreateCommand); REDISMODULE_GET_API(GetCommand); REDISMODULE_GET_API(CreateSubcommand); REDISMODULE_GET_API(SetCommandInfo); REDISMODULE_GET_API(SetCommandACLCategories); + REDISMODULE_GET_API(AddACLCategory); REDISMODULE_GET_API(SetModuleAttribs); REDISMODULE_GET_API(IsModuleNameBusy); REDISMODULE_GET_API(WrongArity); @@ -1611,6 +1619,8 @@ static int RedisModule_Init(RedisModuleCtx *ctx, const char 
*name, int ver, int REDISMODULE_GET_API(GetRandomBytes); REDISMODULE_GET_API(GetRandomHexChars); REDISMODULE_GET_API(SetClusterFlags); + REDISMODULE_GET_API(ClusterKeySlot); + REDISMODULE_GET_API(ClusterCanonicalKeyNameInSlot); REDISMODULE_GET_API(ExportSharedAPI); REDISMODULE_GET_API(GetSharedAPI); REDISMODULE_GET_API(RegisterCommandFilter); diff --git a/src/release.c b/src/release.c index adc7e55ddb9..f6619bb1dd9 100644 --- a/src/release.c +++ b/src/release.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ /* Every time the Redis Git SHA1 or Dirty status changes only this small diff --git a/src/replication.c b/src/replication.c index 97e01b64df0..a3d4eb15cbe 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1,31 +1,10 @@ /* Asynchronous replication implementation. * - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ @@ -210,6 +189,9 @@ int canFeedReplicaReplBuffer(client *replica) { /* Don't feed replicas that are still waiting for BGSAVE to start. */ if (replica->replstate == SLAVE_STATE_WAIT_BGSAVE_START) return 0; + /* Don't feed replicas that are going to be closed ASAP. 
*/ + if (replica->flags & CLIENT_CLOSE_ASAP) return 0; + return 1; } @@ -610,6 +592,7 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, while((ln = listNext(&li))) { client *monitor = ln->value; addReply(monitor,cmdobj); + updateClientMemUsageAndBucket(monitor); } decrRefCount(cmdobj); } @@ -877,7 +860,7 @@ int startBgsaveForReplication(int mincapa, int req) { retval = rdbSaveToSlavesSockets(req,rsiptr); else { /* Keep the page cache since it'll get used soon */ - retval = rdbSaveBackground(req,server.rdb_filename,rsiptr,RDBFLAGS_KEEP_CACHE); + retval = rdbSaveBackground(req, server.rdb_filename, rsiptr, RDBFLAGS_REPLICATION | RDBFLAGS_KEEP_CACHE); } } else { serverLog(LL_WARNING,"BGSAVE for replication: replication information not available, can't generate the RDB file right now. Try later."); @@ -950,7 +933,11 @@ void syncCommand(client *c) { } if (!strcasecmp(c->argv[1]->ptr,server.replid)) { - replicationUnsetMaster(); + if (server.cluster_enabled) { + clusterPromoteSelfToMaster(); + } else { + replicationUnsetMaster(); + } sds client = catClientInfoString(sdsempty(),c); serverLog(LL_NOTICE, "MASTER MODE enabled (failover request from '%s')",client); @@ -1254,7 +1241,7 @@ void replconfCommand(client *c) { int filter_count, i; sds *filters; if (!(filters = sdssplitargs(c->argv[j+1]->ptr, &filter_count))) { - addReplyErrorFormat(c, "Missing rdb-filter-only values"); + addReplyError(c, "Missing rdb-filter-only values"); return; } /* By default filter out all parts of the rdb */ @@ -1735,7 +1722,7 @@ int slaveIsInHandshakeState(void) { * not, since the byte is indivisible. * * The function is called in two contexts: while we flush the current - * data with emptyDb(), and while we load the new data received as an + * data with emptyData(), and while we load the new data received as an * RDB file from the master. 
*/ void replicationSendNewlineToMaster(void) { static time_t newline_sent; @@ -1746,7 +1733,7 @@ void replicationSendNewlineToMaster(void) { } } -/* Callback used by emptyDb() while flushing away old data to load +/* Callback used by emptyData() while flushing away old data to load * the new dataset received by the master and by discardTempDb() * after loading succeeded or failed. */ void replicationEmptyDbCallback(dict *d) { @@ -2235,6 +2222,10 @@ void readSyncBulkPayload(connection *conn) { "disabled"); bg_unlink(server.rdb_filename); } + + /* If disk-based RDB loading fails, remove the half-loaded dataset. */ + emptyData(-1, empty_db_flags, replicationEmptyDbCallback); + /* Note that there's no point in restarting the AOF on sync failure, it'll be restarted when sync succeeds or replica promoted. */ return; @@ -2249,6 +2240,7 @@ void readSyncBulkPayload(connection *conn) { } zfree(server.repl_transfer_tmpfile); + close(server.repl_transfer_fd); server.repl_transfer_fd = -1; server.repl_transfer_tmpfile = NULL; } @@ -3772,7 +3764,7 @@ void replicationCron(void) { * match the one stored into 'mf_master_offset' state. */ int manual_failover_in_progress = ((server.cluster_enabled && - server.cluster->mf_end) || + clusterManualFailoverTimeLimit()) || server.failover_end_time) && isPausedActionsWithUpdate(PAUSE_ACTION_REPLICA); @@ -4059,12 +4051,10 @@ void abortFailover(const char *err) { * will attempt forever and must be manually aborted. */ void failoverCommand(client *c) { - if (server.cluster_enabled) { - addReplyError(c,"FAILOVER not allowed in cluster mode. 
" - "Use CLUSTER FAILOVER command instead."); + if (!clusterAllowFailoverCmd(c)) { return; } - + /* Handle special case for abort */ if ((c->argc == 2) && !strcasecmp(c->argv[1]->ptr,"abort")) { if (server.failover_state == NO_FAILOVER) { diff --git a/src/resp_parser.c b/src/resp_parser.c index b92a74cffbf..e20e9c93c6c 100644 --- a/src/resp_parser.c +++ b/src/resp_parser.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2021, Redis Labs Ltd. + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ /* ---------------------------------------------------------------------------------------- diff --git a/src/resp_parser.h b/src/resp_parser.h index 0b5c8e22c9d..9ca5afa4e03 100644 --- a/src/resp_parser.h +++ b/src/resp_parser.h @@ -1,30 +1,9 @@ /* - * Copyright (c) 2021, Redis Labs Ltd. + * Copyright (c) 2021-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef SRC_RESP_PARSER_H_ diff --git a/src/rio.c b/src/rio.c index eaf88d25fc6..9398a3f78d0 100644 --- a/src/rio.c +++ b/src/rio.c @@ -16,7 +16,7 @@ * ---------------------------------------------------------------------------- * * Copyright (c) 2009-2012, Pieter Noordhuis - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-current, Redis Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/rio.h b/src/rio.h index 9dd59d32b12..361d2004c4d 100644 --- a/src/rio.h +++ b/src/rio.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2009-2012, Pieter Noordhuis - * Copyright (c) 2009-2019, Salvatore Sanfilippo + * Copyright (c) 2009-current, Redis Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/script.c b/src/script.c index 6a798a6e143..a19304ab75f 100644 --- a/src/script.c +++ b/src/script.c @@ -1,36 +1,18 @@ /* - * Copyright (c) 2009-2021, Redis Ltd. + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). 
*/ #include "server.h" #include "script.h" #include "cluster.h" +#include <lua.h> +#include <lauxlib.h> + scriptFlag scripts_flags_def[] = { {.flag = SCRIPT_FLAG_NO_WRITES, .str = "no-writes"}, {.flag = SCRIPT_FLAG_ALLOW_OOM, .str = "allow-oom"}, @@ -60,6 +42,63 @@ static void enterScriptTimedoutMode(scriptRunCtx *run_ctx) { blockingOperationStarts(); } +#if defined(USE_JEMALLOC) +/* When lua uses jemalloc, pass in luaAlloc as a parameter of lua_newstate. */ +static void *luaAlloc(void *ud, void *ptr, size_t osize, size_t nsize) { + UNUSED(osize); + + unsigned int tcache = (unsigned int)(uintptr_t)ud; + if (nsize == 0) { + zfree_with_flags(ptr, MALLOCX_ARENA(server.lua_arena) | MALLOCX_TCACHE(tcache)); + return NULL; + } else { + return zrealloc_with_flags(ptr, nsize, MALLOCX_ARENA(server.lua_arena) | MALLOCX_TCACHE(tcache)); + } +} + +/* Create a lua interpreter, and use jemalloc as lua memory allocator. */ +lua_State *createLuaState(void) { + /* Every time a lua VM is created, a new private tcache is created for use. + * This private tcache will be destroyed after the lua VM is closed. */ + unsigned int tcache; + size_t sz = sizeof(unsigned int); + int err = je_mallctl("tcache.create", (void *)&tcache, &sz, NULL, 0); + if (err) { + serverLog(LL_WARNING, "Failed creating the lua jemalloc tcache."); + exit(1); + } + + /* We pass tcache as ud so that it is not bound to the server. */ + return lua_newstate(luaAlloc, (void *)(uintptr_t)tcache); +} + +/* Under jemalloc we need to create a new arena for lua to avoid blocking + * defragger. */ +void luaEnvInit(void) { + unsigned int arena; + size_t sz = sizeof(unsigned int); + int err = je_mallctl("arenas.create", (void *)&arena, &sz, NULL, 0); + if (err) { + serverLog(LL_WARNING, "Failed creating the lua jemalloc arena."); + exit(1); + } + server.lua_arena = arena; +} + +#else + +/* Create a lua interpreter and use glibc (default) as lua memory allocator. 
*/ +lua_State *createLuaState(void) { + return lua_open(); +} + +/* There is nothing to set up under glib. */ +void luaEnvInit(void) { + server.lua_arena = UINT_MAX; +} + +#endif + int scriptIsTimedout(void) { return scriptIsRunning() && (curr_run_ctx->flags & SCRIPT_TIMEDOUT); } @@ -209,6 +248,7 @@ int scriptPrepareForRun(scriptRunCtx *run_ctx, client *engine_client, client *ca run_ctx->c = engine_client; run_ctx->original_client = caller; run_ctx->funcname = funcname; + run_ctx->slot = caller->slot; client *script_client = run_ctx->c; client *curr_client = run_ctx->original_client; @@ -262,6 +302,8 @@ void scriptResetRun(scriptRunCtx *run_ctx) { unprotectClient(run_ctx->original_client); } + run_ctx->slot = -1; + preventCommandPropagation(run_ctx->original_client); /* unset curr_run_ctx so we will know there is no running script */ @@ -429,7 +471,7 @@ static int scriptVerifyClusterState(scriptRunCtx *run_ctx, client *c, client *or c->flags &= ~(CLIENT_READONLY | CLIENT_ASKING); c->flags |= original_c->flags & (CLIENT_READONLY | CLIENT_ASKING); int hashslot = -1; - if (getNodeByQuery(c, c->cmd, c->argv, c->argc, &hashslot, &error_code) != server.cluster->myself) { + if (getNodeByQuery(c, c->cmd, c->argv, c->argc, &hashslot, &error_code) != getMyClusterNode()) { if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) { *err = sdsnew( "Script attempted to execute a write command while the " @@ -437,7 +479,22 @@ static int scriptVerifyClusterState(scriptRunCtx *run_ctx, client *c, client *or } else if (error_code == CLUSTER_REDIR_DOWN_STATE) { *err = sdsnew("Script attempted to execute a command while the " "cluster is down"); + } else if (error_code == CLUSTER_REDIR_CROSS_SLOT) { + *err = sdscatfmt(sdsempty(), + "Command '%S' in script attempted to access keys that don't hash to the same slot", + c->cmd->fullname); + } else if (error_code == CLUSTER_REDIR_UNSTABLE) { + /* The request spawns multiple keys in the same slot, + * but the slot is not "stable" currently as there 
is + * a migration or import in progress. */ + *err = sdscatfmt(sdsempty(), + "Unable to execute command '%S' in script " + "because undeclared keys were accessed during rehashing of the slot", + c->cmd->fullname); + } else if (error_code == CLUSTER_REDIR_DOWN_UNBOUND) { + *err = sdsnew("Script attempted to access a slot not served"); } else { + /* error_code == CLUSTER_REDIR_MOVED || error_code == CLUSTER_REDIR_ASK */ *err = sdsnew("Script attempted to access a non local key in a " "cluster node"); } @@ -448,14 +505,18 @@ static int scriptVerifyClusterState(scriptRunCtx *run_ctx, client *c, client *or * already been thrown. This is only checking for cross slot keys being accessed * that weren't pre-declared. */ if (hashslot != -1 && !(run_ctx->flags & SCRIPT_ALLOW_CROSS_SLOT)) { - if (original_c->slot == -1) { - original_c->slot = hashslot; - } else if (original_c->slot != hashslot) { + if (run_ctx->slot == -1) { + run_ctx->slot = hashslot; + } else if (run_ctx->slot != hashslot) { *err = sdsnew("Script attempted to access keys that do not hash to " "the same slot"); return C_ERR; } } + + c->slot = hashslot; + original_c->slot = hashslot; + return C_OK; } diff --git a/src/script.h b/src/script.h index c487165d66c..8d604e493de 100644 --- a/src/script.h +++ b/src/script.h @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2021, Redis Ltd. + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef __SCRIPT_H_ @@ -74,6 +53,7 @@ struct scriptRunCtx { int flags; int repl_flags; monotime start_time; + int slot; }; /* Scripts flags */ @@ -92,6 +72,9 @@ typedef struct scriptFlag { extern scriptFlag scripts_flags_def[]; +void luaEnvInit(void); +lua_State *createLuaState(void); +dict *getLuaScripts(void); uint64_t scriptFlagsToCmdFlags(uint64_t cmd_flags, uint64_t script_flags); int scriptPrepareForRun(scriptRunCtx *r_ctx, client *engine_client, client *caller, const char *funcname, uint64_t script_flags, int ro); void scriptResetRun(scriptRunCtx *r_ctx); diff --git a/src/script_lua.c b/src/script_lua.c index 8cdd80523cc..4f325ba2279 100644 --- a/src/script_lua.c +++ b/src/script_lua.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2021, Redis Ltd. + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "script_lua.h" @@ -51,6 +30,7 @@ static char *libraries_allow_list[] = { "math", "table", "struct", + "os", NULL, }; @@ -602,7 +582,7 @@ static void luaReplyToRedisReply(client *c, client* script_client, lua_State *lu * to push 4 elements to the stack. On failure, return error. 
* Notice that we need, in the worst case, 4 elements because returning a map might * require push 4 elements to the Lua stack.*/ - addReplyErrorFormat(c, "reached lua stack limit"); + addReplyError(c, "reached lua stack limit"); lua_pop(lua,1); /* pop the element from the stack */ return; } @@ -818,8 +798,17 @@ static robj **luaArgsToRedisArgv(lua_State *lua, int *argc, int *argv_len) { /* We can't use lua_tolstring() for number -> string conversion * since Lua uses a format specifier that loses precision. */ lua_Number num = lua_tonumber(lua,j+1); - obj_len = fpconv_dtoa((double)num, dbuf); - dbuf[obj_len] = '\0'; + /* Integer printing function is much faster, check if we can safely use it. + * Since lua_Number is not explicitly an integer or a double, we need to make an effort + * to convert it as an integer when that's possible, since the string could later be used + * in a context that doesn't support scientific notation (e.g. 1e9 instead of 100000000). */ + long long lvalue; + if (double2ll((double)num, &lvalue)) + obj_len = ll2string(dbuf, sizeof(dbuf), lvalue); + else { + obj_len = fpconv_dtoa((double)num, dbuf); + dbuf[obj_len] = '\0'; + } obj_s = dbuf; } else { obj_s = (char*)lua_tolstring(lua,j+1,&obj_len); @@ -1169,7 +1158,7 @@ static int luaLogCommand(lua_State *lua) { } level = lua_tonumber(lua,-argc); if (level < LL_DEBUG || level > LL_WARNING) { - luaPushError(lua, "Invalid debug level."); + luaPushError(lua, "Invalid log level."); return luaError(lua); } if (level < server.verbosity) return 0; @@ -1232,6 +1221,7 @@ static void luaLoadLibraries(lua_State *lua) { luaLoadLib(lua, LUA_STRLIBNAME, luaopen_string); luaLoadLib(lua, LUA_MATHLIBNAME, luaopen_math); luaLoadLib(lua, LUA_DBLIBNAME, luaopen_debug); + luaLoadLib(lua, LUA_OSLIBNAME, luaopen_os); luaLoadLib(lua, "cjson", luaopen_cjson); luaLoadLib(lua, "struct", luaopen_struct); luaLoadLib(lua, "cmsgpack", luaopen_cmsgpack); @@ -1239,7 +1229,6 @@ static void luaLoadLibraries(lua_State *lua) { #if 0 
/* Stuff that we don't load currently, for sandboxing concerns. */ luaLoadLib(lua, LUA_LOADLIBNAME, luaopen_package); - luaLoadLib(lua, LUA_OSLIBNAME, luaopen_os); #endif } diff --git a/src/script_lua.h b/src/script_lua.h index 4c2b34804e5..d04ed4cab1b 100644 --- a/src/script_lua.h +++ b/src/script_lua.h @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2021, Redis Ltd. + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
+ * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef __SCRIPT_LUA_H_ diff --git a/src/sds.c b/src/sds.c index e383e3caae7..53bafffe52c 100644 --- a/src/sds.c +++ b/src/sds.c @@ -1,41 +1,18 @@ /* SDSLib 2.0 -- A C dynamic strings library * - * Copyright (c) 2006-2015, Salvatore Sanfilippo - * Copyright (c) 2015, Oran Agra - * Copyright (c) 2015, Redis Labs, Inc + * Copyright (c) 2006-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include #include #include #include -#include #include +#include "redisassert.h" #include "sds.h" #include "sdsalloc.h" #include "util.h" @@ -349,20 +326,22 @@ sds sdsResize(sds s, size_t size, int would_regrow) { * type. */ int use_realloc = (oldtype==type || (type < oldtype && type > SDS_TYPE_8)); size_t newlen = use_realloc ? oldhdrlen+size+1 : hdrlen+size+1; - int alloc_already_optimal = 0; - #if defined(USE_JEMALLOC) - /* je_nallocx returns the expected allocation size for the newlen. - * We aim to avoid calling realloc() when using Jemalloc if there is no - * change in the allocation size, as it incurs a cost even if the - * allocation size stays the same. */ - alloc_already_optimal = (je_nallocx(newlen, 0) == zmalloc_size(sh)); - #endif - - if (use_realloc && !alloc_already_optimal) { - newsh = s_realloc(sh, newlen); - if (newsh == NULL) return NULL; - s = (char*)newsh+oldhdrlen; - } else if (!alloc_already_optimal) { + + if (use_realloc) { + int alloc_already_optimal = 0; + #if defined(USE_JEMALLOC) + /* je_nallocx returns the expected allocation size for the newlen. + * We aim to avoid calling realloc() when using Jemalloc if there is no + * change in the allocation size, as it incurs a cost even if the + * allocation size stays the same. 
*/ + alloc_already_optimal = (je_nallocx(newlen, 0) == zmalloc_size(sh)); + #endif + if (!alloc_already_optimal) { + newsh = s_realloc(sh, newlen); + if (newsh == NULL) return NULL; + s = (char*)newsh+oldhdrlen; + } + } else { newsh = s_malloc(newlen); if (newsh == NULL) return NULL; memcpy((char*)newsh+hdrlen, s, len); diff --git a/src/sds.h b/src/sds.h index 208eaa210d9..bf31c7610b4 100644 --- a/src/sds.h +++ b/src/sds.h @@ -1,33 +1,10 @@ /* SDSLib 2.0 -- A C dynamic strings library * - * Copyright (c) 2006-2015, Salvatore Sanfilippo - * Copyright (c) 2015, Oran Agra - * Copyright (c) 2015, Redis Labs, Inc + * Copyright (c) 2006-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #ifndef __SDS_H diff --git a/src/sdsalloc.h b/src/sdsalloc.h index a1c5584f047..447cfbf4b62 100644 --- a/src/sdsalloc.h +++ b/src/sdsalloc.h @@ -1,32 +1,10 @@ /* SDSLib 2.0 -- A C dynamic strings library * - * Copyright (c) 2006-2015, Salvatore Sanfilippo - * Copyright (c) 2015, Redis Labs, Inc + * Copyright (c) 2006-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ /* SDS allocator selection. diff --git a/src/sentinel.c b/src/sentinel.c index 238be905f6d..1d6c5659d9c 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1,31 +1,10 @@ /* Redis Sentinel implementation * - * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -4129,7 +4108,7 @@ NULL else if (!strcasecmp(c->argv[2]->ptr,"get") && c->argc >= 4) sentinelConfigGetCommand(c); else - addReplyError(c, "Only SENTINEL CONFIG GET [ ...]/ SET [ ...] are supported."); + addReplyError(c, "Only SENTINEL CONFIG GET [ ...] / SET [ ...] are supported."); } else if (!strcasecmp(c->argv[1]->ptr,"info-cache")) { /* SENTINEL INFO-CACHE */ if (c->argc < 2) goto numargserr; diff --git a/src/server.c b/src/server.c index 6815aac3b15..11646e25687 100644 --- a/src/server.c +++ b/src/server.c @@ -1,30 +1,9 @@ /* - * Copyright (c) 2009-2016, Salvatore Sanfilippo + * Copyright (c) 2009-Present, Redis Ltd. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2) or the Server Side Public License v1 (SSPLv1). */ #include "server.h" @@ -38,12 +17,15 @@ #include "functions.h" #include "hdr_histogram.h" #include "syscheck.h" +#include "threads_mngr.h" +#include "fmtargs.h" +#include "mstr.h" +#include "ebuckets.h" #include #include #include #include -#include #include #include #include @@ -69,6 +51,12 @@ #include #endif +#ifdef __GNUC__ +#define GNUC_VERSION_STR STRINGIFY(__GNUC__) "." STRINGIFY(__GNUC_MINOR__) "." 
STRINGIFY(__GNUC_PATCHLEVEL__) +#else +#define GNUC_VERSION_STR "0.0.0" +#endif + /* Our shared "common" objects */ struct sharedObjectsStruct shared; @@ -123,11 +111,13 @@ void serverLogRaw(int level, const char *msg) { int off; struct timeval tv; int role_char; + int daylight_active = 0; pid_t pid = getpid(); gettimeofday(&tv,NULL); struct tm tm; - nolocks_localtime(&tm,tv.tv_sec,server.timezone,server.daylight_active); + atomicGet(server.daylight_active, daylight_active); + nolocks_localtime(&tm,tv.tv_sec,server.timezone,daylight_active); off = strftime(buf,sizeof(buf),"%d %b %Y %H:%M:%S.",&tm); snprintf(buf+off,sizeof(buf)-off,"%03d",(int)tv.tv_usec/1000); if (server.sentinel_mode) { @@ -160,13 +150,9 @@ void _serverLog(int level, const char *fmt, ...) { serverLogRaw(level,msg); } -/* Log a fixed message without printf-alike capabilities, in a way that is - * safe to call from a signal handler. - * - * We actually use this only for signals that are not fatal from the point - * of view of Redis. Signals that are going to kill the server anyway and - * where we need printf-alike features are served by serverLog(). */ -void serverLogFromHandler(int level, const char *msg) { +/* Low level logging from signal handler. Should be used with pre-formatted strings. + See serverLogFromHandler. */ +void serverLogRawFromHandler(int level, const char *msg) { int fd; int log_to_stdout = server.logfile[0] == '\0'; char buf[64]; @@ -176,18 +162,41 @@ void serverLogFromHandler(int level, const char *msg) { fd = log_to_stdout ? 
STDOUT_FILENO : open(server.logfile, O_APPEND|O_CREAT|O_WRONLY, 0644); if (fd == -1) return; - ll2string(buf,sizeof(buf),getpid()); - if (write(fd,buf,strlen(buf)) == -1) goto err; - if (write(fd,":signal-handler (",17) == -1) goto err; - ll2string(buf,sizeof(buf),time(NULL)); - if (write(fd,buf,strlen(buf)) == -1) goto err; - if (write(fd,") ",2) == -1) goto err; - if (write(fd,msg,strlen(msg)) == -1) goto err; - if (write(fd,"\n",1) == -1) goto err; + if (level & LL_RAW) { + if (write(fd,msg,strlen(msg)) == -1) goto err; + } + else { + ll2string(buf,sizeof(buf),getpid()); + if (write(fd,buf,strlen(buf)) == -1) goto err; + if (write(fd,":signal-handler (",17) == -1) goto err; + ll2string(buf,sizeof(buf),time(NULL)); + if (write(fd,buf,strlen(buf)) == -1) goto err; + if (write(fd,") ",2) == -1) goto err; + if (write(fd,msg,strlen(msg)) == -1) goto err; + if (write(fd,"\n",1) == -1) goto err; + } err: if (!log_to_stdout) close(fd); } +/* An async-signal-safe version of serverLog. if LL_RAW is not included in level flags, + * The message format is: :signal-handler (