diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e693da5c63ed1..2266b84ef3560 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -236,10 +236,7 @@ variables: # Start aws ssm variables # They must be defined as environment variables in the GitLab CI/CD settings, to ease rotation if needed - AGENT_QA_PROFILE: ci.datadog-agent.agent-qa-profile # agent-devx-infra API_KEY_ORG2: ci.datadog-agent.datadog_api_key_org2 # agent-devx-infra - API_KEY_DDDEV: ci.datadog-agent.datadog_api_key # agent-devx-infra - APP_KEY_ORG2: ci.datadog-agent.datadog_app_key_org2 # agent-devx-infra CHANGELOG_COMMIT_SHA: ci.datadog-agent.gitlab_changelog_commit_sha # agent-devx-infra CHOCOLATEY_API_KEY: ci.datadog-agent.chocolatey_api_key # windows-agent CODECOV_TOKEN: ci.datadog-agent.codecov_token # agent-devx-infra @@ -247,47 +244,8 @@ variables: DEB_SIGNING_PASSPHRASE: ci.datadog-agent.deb_signing_key_passphrase_${DEB_GPG_KEY_ID} # agent-delivery DOCKER_REGISTRY_LOGIN: ci.datadog-agent.docker_hub_login # container-integrations DOCKER_REGISTRY_PWD: ci.datadog-agent.docker_hub_pwd # container-integrations - E2E_TESTS_API_KEY: ci.datadog-agent.e2e_tests_api_key # agent-devx-loops - E2E_TESTS_APP_KEY: ci.datadog-agent.e2e_tests_app_key # agent-devx-loops - E2E_TESTS_RC_KEY: ci.datadog-agent.e2e_tests_rc_key # agent-devx-loops - E2E_TESTS_AZURE_CLIENT_ID: ci.datadog-agent.e2e_tests_azure_client_id # agent-devx-loops - E2E_TESTS_AZURE_CLIENT_SECRET: ci.datadog-agent.e2e_tests_azure_client_secret # agent-devx-loops - E2E_TESTS_AZURE_TENANT_ID: ci.datadog-agent.e2e_tests_azure_tenant_id # agent-devx-loops - E2E_TESTS_AZURE_SUBSCRIPTION_ID: ci.datadog-agent.e2e_tests_azure_subscription_id # agent-devx-loops - E2E_TESTS_GCP_CREDENTIALS: ci.datadog-agent.e2e_tests_gcp_credentials # agent-devx-loops - KITCHEN_EC2_SSH_KEY: ci.datadog-agent.aws_ec2_kitchen_ssh_key # agent-devx-loops - KITCHEN_AZURE_CLIENT_ID: ci.datadog-agent.azure_kitchen_client_id # agent-devx-loops - KITCHEN_AZURE_CLIENT_SECRET: ci.datadog-agent.azure_kitchen_client_secret # agent-devx-loops - KITCHEN_AZURE_SUBSCRIPTION_ID: ci.datadog-agent.azure_kitchen_subscription_id # agent-devx-loops - KITCHEN_AZURE_TENANT_ID: ci.datadog-agent.azure_kitchen_tenant_id # agent-devx-loops - GITHUB_PR_COMMENTER_APP_KEY: pr-commenter.github_app_key # agent-devx-infra - GITHUB_PR_COMMENTER_INTEGRATION_ID: pr-commenter.github_integration_id # agent-devx-infra - GITHUB_PR_COMMENTER_INSTALLATION_ID: pr-commenter.github_installation_id # agent-devx-infra - GITLAB_SCHEDULER_TOKEN: ci.datadog-agent.gitlab_pipelines_scheduler_token # ci-cd - GITLAB_READ_API_TOKEN: ci.datadog-agent.gitlab_read_api_token # ci-cd - GITLAB_FULL_API_TOKEN: ci.datadog-agent.gitlab_full_api_token # ci-cd - INSTALL_SCRIPT_API_KEY: ci.agent-linux-install-script.datadog_api_key_2 # agent-delivery - JIRA_READ_API_TOKEN: ci.datadog-agent.jira_read_api_token # agent-devx-infra - AGENT_GITHUB_APP_ID: ci.datadog-agent.platform-github-app-id # agent-devx-infra - AGENT_GITHUB_INSTALLATION_ID: ci.datadog-agent.platform-github-app-installation-id # agent-devx-infra - AGENT_GITHUB_KEY: ci.datadog-agent.platform-github-app-key # agent-devx-infra - MACOS_GITHUB_APP_ID: ci.datadog-agent.macos_github_app_id # agent-devx-infra - MACOS_GITHUB_INSTALLATION_ID: ci.datadog-agent.macos_github_installation_id # agent-devx-infra - MACOS_GITHUB_KEY: ci.datadog-agent.macos_github_key_b64 # agent-devx-infra - MACOS_GITHUB_APP_ID_2: ci.datadog-agent.macos_github_app_id_2 # agent-devx-infra - MACOS_GITHUB_INSTALLATION_ID_2: ci.datadog-agent.macos_github_installation_id_2 # agent-devx-infra - MACOS_GITHUB_KEY_2: ci.datadog-agent.macos_github_key_b64_2 # agent-devx-infra RPM_GPG_KEY: ci.datadog-agent.rpm_signing_private_key_${RPM_GPG_KEY_ID} # agent-delivery RPM_SIGNING_PASSPHRASE: ci.datadog-agent.rpm_signing_key_passphrase_${RPM_GPG_KEY_ID} # agent-delivery - SLACK_AGENT_CI_TOKEN: ci.datadog-agent.slack_agent_ci_token # agent-devx-infra - SMP_ACCOUNT_ID: ci.datadog-agent.single-machine-performance-account-id # single-machine-performance - SMP_AGENT_TEAM_ID: ci.datadog-agent.single-machine-performance-agent-team-id # single-machine-performance - SMP_API: ci.datadog-agent.single-machine-performance-api # single-machine-performance - SMP_BOT_ACCESS_KEY: ci.datadog-agent.single-machine-performance-bot-access-key # single-machine-performance - SMP_BOT_ACCESS_KEY_ID: ci.datadog-agent.single-machine-performance-bot-access-key-id # single-machine-performance - SSH_KEY: ci.datadog-agent.ssh_key # system-probe - SSH_KEY_RSA: ci.datadog-agent.ssh_key_rsa # agent-devx-loops - SSH_PUBLIC_KEY_RSA: ci.datadog-agent.ssh_public_key_rsa # agent-devx-loops VCPKG_BLOB_SAS_URL: ci.datadog-agent-buildimages.vcpkg_blob_sas_url # windows-agent WINGET_PAT: ci.datadog-agent.winget_pat # windows-agent # End aws ssm variables diff --git a/.gitlab/container_build/docker_linux.yml b/.gitlab/container_build/docker_linux.yml index df83b97171cb8..5f5c83c0dce67 100644 --- a/.gitlab/container_build/docker_linux.yml +++ b/.gitlab/container_build/docker_linux.yml @@ -61,7 +61,7 @@ docker_build_agent7: IMAGE: registry.ddbuild.io/ci/datadog-agent/agent BUILD_CONTEXT: Dockerfiles/agent TAG_SUFFIX: -7 - BUILD_ARG: --target test --build-arg PYTHON_VERSION=3 --build-arg DD_AGENT_ARTIFACT=datadog-agent-7*-amd64.tar.xz + BUILD_ARG: --target test --build-arg DD_AGENT_ARTIFACT=datadog-agent-7*-amd64.tar.xz single_machine_performance-amd64-a7: extends: .docker_publish_job_definition @@ -87,7 +87,7 @@ docker_build_agent7_arm64: IMAGE: registry.ddbuild.io/ci/datadog-agent/agent BUILD_CONTEXT: Dockerfiles/agent TAG_SUFFIX: -7 - BUILD_ARG: --target test --build-arg PYTHON_VERSION=3 --build-arg DD_AGENT_ARTIFACT=datadog-agent-7*-arm64.tar.xz + BUILD_ARG: --target test --build-arg DD_AGENT_ARTIFACT=datadog-agent-7*-arm64.tar.xz # build agent7 jmx image docker_build_agent7_jmx: @@ -101,7 +101,7 @@ docker_build_agent7_jmx: IMAGE: registry.ddbuild.io/ci/datadog-agent/agent BUILD_CONTEXT: Dockerfiles/agent TAG_SUFFIX: -7-jmx - BUILD_ARG: --target test --build-arg WITH_JMX=true --build-arg PYTHON_VERSION=3 --build-arg DD_AGENT_ARTIFACT=datadog-agent-7*-amd64.tar.xz + BUILD_ARG: --target test --build-arg WITH_JMX=true --build-arg DD_AGENT_ARTIFACT=datadog-agent-7*-amd64.tar.xz docker_build_agent7_jmx_arm64: extends: [.docker_build_job_definition_arm64, .docker_build_artifact] @@ -114,7 +114,7 @@ docker_build_agent7_jmx_arm64: IMAGE: registry.ddbuild.io/ci/datadog-agent/agent BUILD_CONTEXT: Dockerfiles/agent TAG_SUFFIX: -7-jmx - BUILD_ARG: --target test --build-arg WITH_JMX=true --build-arg PYTHON_VERSION=3 --build-arg DD_AGENT_ARTIFACT=datadog-agent-7*-arm64.tar.xz + BUILD_ARG: --target test --build-arg WITH_JMX=true --build-arg DD_AGENT_ARTIFACT=datadog-agent-7*-arm64.tar.xz # build agent7 UA image docker_build_ot_agent7: @@ -128,7 +128,7 @@ docker_build_ot_agent7: IMAGE: registry.ddbuild.io/ci/datadog-agent/agent BUILD_CONTEXT: Dockerfiles/agent TAG_SUFFIX: -7-ot-beta - BUILD_ARG: --target test --build-arg PYTHON_VERSION=3 --build-arg DD_AGENT_ARTIFACT=datadog-ot-agent-7*-amd64.tar.xz + BUILD_ARG: --target test --build-arg DD_AGENT_ARTIFACT=datadog-ot-agent-7*-amd64.tar.xz docker_build_ot_agent7_arm64: extends: [.docker_build_job_definition_arm64, .docker_build_artifact] @@ -141,7 +141,7 @@ docker_build_ot_agent7_arm64: IMAGE: registry.ddbuild.io/ci/datadog-agent/agent BUILD_CONTEXT: Dockerfiles/agent TAG_SUFFIX: -7-ot-beta - BUILD_ARG: --target test --build-arg PYTHON_VERSION=3 --build-arg DD_AGENT_ARTIFACT=datadog-ot-agent-7*-arm64.tar.xz + BUILD_ARG: --target test --build-arg DD_AGENT_ARTIFACT=datadog-ot-agent-7*-arm64.tar.xz # build agent7 jmx image docker_build_ot_agent7_jmx: @@ -155,7 +155,7 @@ docker_build_ot_agent7_jmx: IMAGE: registry.ddbuild.io/ci/datadog-agent/agent BUILD_CONTEXT: Dockerfiles/agent TAG_SUFFIX: -7-ot-beta-jmx - BUILD_ARG: --target test --build-arg WITH_JMX=true --build-arg PYTHON_VERSION=3 --build-arg DD_AGENT_ARTIFACT=datadog-ot-agent-7*-amd64.tar.xz + BUILD_ARG: --target test --build-arg WITH_JMX=true --build-arg DD_AGENT_ARTIFACT=datadog-ot-agent-7*-amd64.tar.xz docker_build_ot_agent7_jmx_arm64: extends: [.docker_build_job_definition_arm64, .docker_build_artifact] @@ -168,7 +168,7 @@ docker_build_ot_agent7_jmx_arm64: IMAGE: registry.ddbuild.io/ci/datadog-agent/agent BUILD_CONTEXT: Dockerfiles/agent TAG_SUFFIX: -7-ot-beta-jmx - BUILD_ARG: --target test --build-arg WITH_JMX=true --build-arg PYTHON_VERSION=3 --build-arg DD_AGENT_ARTIFACT=datadog-ot-agent-7*-arm64.tar.xz + BUILD_ARG: --target test --build-arg WITH_JMX=true --build-arg DD_AGENT_ARTIFACT=datadog-ot-agent-7*-arm64.tar.xz # build the cluster-agent image docker_build_cluster_agent_amd64: diff --git a/.gitlab/lint/technical_linters.yml b/.gitlab/lint/technical_linters.yml index d1e03b7ee4b02..b84b1ece24671 100644 --- a/.gitlab/lint/technical_linters.yml +++ b/.gitlab/lint/technical_linters.yml @@ -3,6 +3,7 @@ stage: lint image: registry.ddbuild.io/ci/datadog-agent-buildimages/deb_x64$DATADOG_AGENT_BUILDIMAGES_SUFFIX:$DATADOG_AGENT_BUILDIMAGES tags: ["arch:amd64"] + needs: [] lint_licenses: extends: .lint @@ -46,19 +47,16 @@ lint_components: lint_python: extends: .lint - needs: [] script: - inv -e linter.python lint_update_go: extends: .lint - needs: [] script: - inv -e linter.update-go validate_modules: extends: .lint - needs: [] script: - inv -e modules.validate - inv -e modules.validate-used-by-otel diff --git a/.gitlab/source_test/macos.yml b/.gitlab/source_test/macos.yml index 338f2f9eca347..63d660ce8bf7c 100644 --- a/.gitlab/source_test/macos.yml +++ b/.gitlab/source_test/macos.yml @@ -22,7 +22,7 @@ tests_macos: - python3 -m pip install -r tasks/libs/requirements-github.txt --break-system-packages - FAST_TESTS_FLAG="" - if [[ "$FAST_TESTS" = "true" ]]; then FAST_TESTS_FLAG="--fast-tests true"; fi - - inv -e github.trigger-macos --workflow-type "test" --datadog-agent-ref "$CI_COMMIT_SHA" --version-cache "$VERSION_CACHE_CONTENT" $FAST_TESTS_FLAG --test-washer $COVERAGE_CACHE_FLAG + - inv -e github.trigger-macos --workflow-type "test" --datadog-agent-ref "$CI_COMMIT_SHA" --version-cache "$VERSION_CACHE_CONTENT" $FAST_TESTS_FLAG --test-washer timeout: 6h after_script: - $CI_PROJECT_DIR/tools/ci/junit_upload.sh "junit-*-repacked.tgz" @@ -51,7 +51,7 @@ tests_macos: - inv -e gitlab.generate-ci-visibility-links --output=$EXTERNAL_LINKS_PATH - FAST_TESTS_FLAG="" - if [[ "$FAST_TESTS" == "true" ]]; then FAST_TESTS_FLAG="--only-impacted-packages"; fi - - inv -e test --rerun-fails=2 --race --profile --cpus 12 --save-result-json $TEST_OUTPUT_FILE --junit-tar "junit-${CI_JOB_NAME}.tgz" $FAST_TESTS_FLAG --test-washer + - inv -e test --rerun-fails=2 --race --profile --cpus 12 --save-result-json $TEST_OUTPUT_FILE --junit-tar "junit-${CI_JOB_NAME}.tgz" $FAST_TESTS_FLAG --test-washer --coverage - inv -e invoke-unit-tests artifacts: expire_in: 2 weeks diff --git a/Dockerfiles/agent/Dockerfile b/Dockerfiles/agent/Dockerfile index 894a29f2d250e..bf99bef8e05a1 100644 --- a/Dockerfiles/agent/Dockerfile +++ b/Dockerfiles/agent/Dockerfile @@ -39,7 +39,6 @@ RUN gcc -pipe -Wall -Wextra -O2 -shared -fPIC -Wl,--version-script=/tmp/nosys.sy FROM baseimage AS extract ARG TARGETARCH ARG WITH_JMX -ARG PYTHON_VERSION ARG DD_AGENT_ARTIFACT=datadog-agent*-$TARGETARCH.tar.xz ARG GENERAL_ARTIFACTS_CACHE_BUCKET_URL @@ -75,25 +74,6 @@ RUN find / -maxdepth 1 -name "${DD_AGENT_ARTIFACT}" -exec tar xvf {} -C . \; \ opt/datadog-agent/embedded/share/man \ # self-test certificates that are detected (false positive) as private keys opt/datadog-agent/embedded/lib/python*/site-packages/future/backports/test \ - && if [ "$PYTHON_VERSION" = "2" ]; then \ - rm -rf \ - opt/datadog-agent/embedded/bin/2to3-3* \ - opt/datadog-agent/embedded/bin/easy_install-3* \ - opt/datadog-agent/embedded/bin/idle* \ - opt/datadog-agent/embedded/bin/pip3* \ - opt/datadog-agent/embedded/bin/pydoc* \ - opt/datadog-agent/embedded/bin/python3* \ - opt/datadog-agent/embedded/bin/pyvenv* \ - opt/datadog-agent/embedded/include/python3* \ - opt/datadog-agent/embedded/lib/*python3* || true ;\ - fi \ - && if [ "$PYTHON_VERSION" = "3" ]; then \ - rm -rf \ - opt/datadog-agent/embedded/bin/pip2* \ - opt/datadog-agent/embedded/bin/python2* \ - opt/datadog-agent/embedded/include/python2* \ - opt/datadog-agent/embedded/lib/*python2* || true ;\ - fi \ && find opt/datadog-agent/ -iname "*.a" -delete \ && if [ -z "$WITH_JMX" ]; then rm -rf opt/datadog-agent/bin/agent/dist/jmx; fi \ && mkdir conf.d checks.d @@ -116,11 +96,9 @@ RUN if [ -n "$WITH_JMX" ]; then cd /opt/bouncycastle-fips && mvn dependency:copy FROM baseimage AS release LABEL maintainer="Datadog " ARG WITH_JMX -ARG PYTHON_VERSION ARG DD_GIT_REPOSITORY_URL ARG DD_GIT_COMMIT_SHA ENV DOCKER_DD_AGENT=true \ - DD_PYTHON_VERSION=$PYTHON_VERSION \ PATH=/opt/datadog-agent/bin/agent/:/opt/datadog-agent/embedded/bin/:$PATH \ CURL_CA_BUNDLE=/opt/datadog-agent/embedded/ssl/certs/cacert.pem \ # Pass envvar variables to agents @@ -202,13 +180,6 @@ RUN tar xzf s6.tgz -C / --exclude="./bin" \ # * https://datadoghq.atlassian.net/wiki/spaces/TS/pages/2615709591/Why+the+containerized+Agent+runs+as+root#Agent-user RUN [ "$(getent passwd dd-agent | cut -d: -f 3)" -eq 100 ] -# Update links to python binaries -RUN if [ -n "$PYTHON_VERSION" ]; then \ - ln -sfn /opt/datadog-agent/embedded/bin/python${PYTHON_VERSION} /opt/datadog-agent/embedded/bin/python \ - && ln -sfn /opt/datadog-agent/embedded/bin/python${PYTHON_VERSION}-config /opt/datadog-agent/embedded/bin/python-config \ - && ln -sfn /opt/datadog-agent/embedded/bin/pip${PYTHON_VERSION} /opt/datadog-agent/embedded/bin/pip ; \ - fi - # Override the exit script by ours to fix --pid=host operations RUN mv /etc/s6/init/init-stage3 /etc/s6/init/init-stage3-original COPY init-stage3 /etc/s6/init/init-stage3 diff --git a/cmd/system-probe/api/module/common.go b/cmd/system-probe/api/module/common.go index 0e1b156553fd7..cec9bf608de45 100644 --- a/cmd/system-probe/api/module/common.go +++ b/cmd/system-probe/api/module/common.go @@ -9,6 +9,7 @@ package module import ( "errors" + tagger "github.com/DataDog/datadog-agent/comp/core/tagger/def" "github.com/DataDog/datadog-agent/comp/core/telemetry" workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def" "go.uber.org/fx" @@ -30,5 +31,6 @@ type FactoryDependencies struct { fx.In WMeta workloadmeta.Component + Tagger tagger.Component Telemetry telemetry.Component } diff --git a/cmd/system-probe/api/module/loader.go b/cmd/system-probe/api/module/loader.go index 51048f9d63a0e..5e028b0d538f7 100644 --- a/cmd/system-probe/api/module/loader.go +++ b/cmd/system-probe/api/module/loader.go @@ -16,6 +16,7 @@ import ( "github.com/gorilla/mux" sysconfigtypes "github.com/DataDog/datadog-agent/cmd/system-probe/config/types" + tagger "github.com/DataDog/datadog-agent/comp/core/tagger/def" "github.com/DataDog/datadog-agent/comp/core/telemetry" workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def" "github.com/DataDog/datadog-agent/pkg/util/log" @@ -63,7 +64,7 @@ func withModule(name sysconfigtypes.ModuleName, fn func()) { // * Initialization using the provided Factory; // * Registering the HTTP endpoints of each module; // * Register the gRPC server; -func Register(cfg *sysconfigtypes.Config, httpMux *mux.Router, factories []Factory, wmeta workloadmeta.Component, telemetry telemetry.Component) error { +func Register(cfg *sysconfigtypes.Config, httpMux *mux.Router, factories []Factory, wmeta workloadmeta.Component, tagger tagger.Component, telemetry telemetry.Component) error { var enabledModulesFactories []Factory for _, factory := range factories { if !cfg.ModuleIsEnabled(factory.Name) { @@ -83,6 +84,7 @@ func Register(cfg *sysconfigtypes.Config, httpMux *mux.Router, factories []Facto withModule(factory.Name, func() { deps := FactoryDependencies{ WMeta: wmeta, + Tagger: tagger, Telemetry: telemetry, } module, err = factory.Fn(cfg, deps) @@ -143,7 +145,7 @@ func GetStats() map[string]interface{} { } // RestartModule triggers a module restart -func RestartModule(factory Factory, wmeta workloadmeta.Component, telemetry telemetry.Component) error { +func RestartModule(factory Factory, wmeta workloadmeta.Component, tagger tagger.Component, telemetry telemetry.Component) error { l.Lock() defer l.Unlock() @@ -162,6 +164,7 @@ func RestartModule(factory Factory, wmeta workloadmeta.Component, telemetry tele currentModule.Close() deps := FactoryDependencies{ WMeta: wmeta, + Tagger: tagger, Telemetry: telemetry, } newModule, err = factory.Fn(l.cfg, deps) diff --git a/cmd/system-probe/api/restart.go b/cmd/system-probe/api/restart.go index 608f1c014ffe8..fb31c5cc8c684 100644 --- a/cmd/system-probe/api/restart.go +++ b/cmd/system-probe/api/restart.go @@ -14,11 +14,12 @@ import ( "github.com/DataDog/datadog-agent/cmd/system-probe/config" sysconfigtypes "github.com/DataDog/datadog-agent/cmd/system-probe/config/types" "github.com/DataDog/datadog-agent/cmd/system-probe/modules" + tagger "github.com/DataDog/datadog-agent/comp/core/tagger/def" "github.com/DataDog/datadog-agent/comp/core/telemetry" workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def" ) -func restartModuleHandler(w http.ResponseWriter, r *http.Request, wmeta workloadmeta.Component, telemetry telemetry.Component) { +func restartModuleHandler(w http.ResponseWriter, r *http.Request, wmeta workloadmeta.Component, tagger tagger.Component, telemetry telemetry.Component) { vars := mux.Vars(r) moduleName := sysconfigtypes.ModuleName(vars["module-name"]) @@ -39,7 +40,7 @@ func restartModuleHandler(w http.ResponseWriter, r *http.Request, wmeta workload return } - err := module.RestartModule(target, wmeta, telemetry) + err := module.RestartModule(target, wmeta, tagger, telemetry) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return diff --git a/cmd/system-probe/api/server.go b/cmd/system-probe/api/server.go index 1e4249eeb50ab..3e4a71056f143 100644 --- a/cmd/system-probe/api/server.go +++ b/cmd/system-probe/api/server.go @@ -21,6 +21,7 @@ import ( "github.com/DataDog/datadog-agent/cmd/system-probe/modules" "github.com/DataDog/datadog-agent/cmd/system-probe/utils" "github.com/DataDog/datadog-agent/comp/core/settings" + tagger "github.com/DataDog/datadog-agent/comp/core/tagger/def" "github.com/DataDog/datadog-agent/comp/core/telemetry" workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def" "github.com/DataDog/datadog-agent/pkg/ebpf" @@ -28,7 +29,7 @@ import ( ) // StartServer starts the HTTP and gRPC servers for the system-probe, which registers endpoints from all enabled modules. -func StartServer(cfg *sysconfigtypes.Config, telemetry telemetry.Component, wmeta workloadmeta.Component, settings settings.Component) error { +func StartServer(cfg *sysconfigtypes.Config, telemetry telemetry.Component, wmeta workloadmeta.Component, tagger tagger.Component, settings settings.Component) error { conn, err := server.NewListener(cfg.SocketAddress) if err != nil { return err @@ -36,7 +37,7 @@ func StartServer(cfg *sysconfigtypes.Config, telemetry telemetry.Component, wmet mux := gorilla.NewRouter() - err = module.Register(cfg, mux, modules.All, wmeta, telemetry) + err = module.Register(cfg, mux, modules.All, wmeta, tagger, telemetry) if err != nil { return fmt.Errorf("failed to create system probe: %s", err) } @@ -49,7 +50,7 @@ func StartServer(cfg *sysconfigtypes.Config, telemetry telemetry.Component, wmet setupConfigHandlers(mux, settings) // Module-restart handler - mux.HandleFunc("/module-restart/{module-name}", func(w http.ResponseWriter, r *http.Request) { restartModuleHandler(w, r, wmeta, telemetry) }).Methods("POST") + mux.HandleFunc("/module-restart/{module-name}", func(w http.ResponseWriter, r *http.Request) { restartModuleHandler(w, r, wmeta, tagger, telemetry) }).Methods("POST") mux.PathPrefix("/debug/pprof").Handler(http.DefaultServeMux) mux.Handle("/debug/vars", http.DefaultServeMux) diff --git a/cmd/system-probe/subcommands/run/command.go b/cmd/system-probe/subcommands/run/command.go index 4459dce978347..ab55fc4287c0a 100644 --- a/cmd/system-probe/subcommands/run/command.go +++ b/cmd/system-probe/subcommands/run/command.go @@ -42,6 +42,9 @@ import ( "github.com/DataDog/datadog-agent/comp/core/settings/settingsimpl" "github.com/DataDog/datadog-agent/comp/core/sysprobeconfig" "github.com/DataDog/datadog-agent/comp/core/sysprobeconfig/sysprobeconfigimpl" + tagger "github.com/DataDog/datadog-agent/comp/core/tagger/def" + remoteTaggerFx "github.com/DataDog/datadog-agent/comp/core/tagger/fx-remote" + taggerTypes "github.com/DataDog/datadog-agent/comp/core/tagger/types" "github.com/DataDog/datadog-agent/comp/core/telemetry" "github.com/DataDog/datadog-agent/comp/core/telemetry/telemetryimpl" wmcatalog "github.com/DataDog/datadog-agent/comp/core/workloadmeta/collectors/catalog" @@ -50,6 +53,7 @@ import ( compstatsd "github.com/DataDog/datadog-agent/comp/dogstatsd/statsd" "github.com/DataDog/datadog-agent/comp/remote-config/rcclient" "github.com/DataDog/datadog-agent/comp/remote-config/rcclient/rcclientimpl" + "github.com/DataDog/datadog-agent/pkg/api/security" "github.com/DataDog/datadog-agent/pkg/config/env" "github.com/DataDog/datadog-agent/pkg/config/model" commonsettings "github.com/DataDog/datadog-agent/pkg/config/settings" @@ -111,6 +115,14 @@ func Commands(globalParams *command.GlobalParams) []*cobra.Command { workloadmetafx.Module(workloadmeta.Params{ AgentType: workloadmeta.Remote, }), + // Provide tagger module + remoteTaggerFx.Module(tagger.RemoteParams{ + RemoteTarget: func(c config.Component) (string, error) { return fmt.Sprintf(":%v", c.GetInt("cmd_port")), nil }, + RemoteTokenFetcher: func(c config.Component) func() (string, error) { + return func() (string, error) { return security.FetchAuthToken(c) } + }, + RemoteFilter: taggerTypes.NewMatchAllFilter(), + }), autoexitimpl.Module(), pidimpl.Module(), fx.Supply(pidimpl.NewParams(cliParams.pidfilePath)), @@ -140,7 +152,7 @@ func Commands(globalParams *command.GlobalParams) []*cobra.Command { } // run starts the main loop. -func run(log log.Component, _ config.Component, statsd compstatsd.Component, telemetry telemetry.Component, sysprobeconfig sysprobeconfig.Component, rcclient rcclient.Component, wmeta workloadmeta.Component, _ pid.Component, _ healthprobe.Component, _ autoexit.Component, settings settings.Component) error { +func run(log log.Component, _ config.Component, statsd compstatsd.Component, telemetry telemetry.Component, sysprobeconfig sysprobeconfig.Component, rcclient rcclient.Component, wmeta workloadmeta.Component, tagger tagger.Component, _ pid.Component, _ healthprobe.Component, _ autoexit.Component, settings settings.Component) error { defer func() { stopSystemProbe() }() @@ -182,7 +194,7 @@ func run(log log.Component, _ config.Component, statsd compstatsd.Component, tel } }() - if err := startSystemProbe(log, statsd, telemetry, sysprobeconfig, rcclient, wmeta, settings); err != nil { + if err := startSystemProbe(log, statsd, telemetry, sysprobeconfig, rcclient, wmeta, tagger, settings); err != nil { if errors.Is(err, ErrNotEnabled) { // A sleep is necessary to ensure that supervisor registers this process as "STARTED" // If the exit is "too quick", we enter a BACKOFF->FATAL loop even though this is an expected exit @@ -226,9 +238,9 @@ func StartSystemProbeWithDefaults(ctxChan <-chan context.Context) (<-chan error, func runSystemProbe(ctxChan <-chan context.Context, errChan chan error) error { return fxutil.OneShot( - func(log log.Component, _ config.Component, statsd compstatsd.Component, telemetry telemetry.Component, sysprobeconfig sysprobeconfig.Component, rcclient rcclient.Component, wmeta workloadmeta.Component, _ healthprobe.Component, settings settings.Component) error { + func(log log.Component, _ config.Component, statsd compstatsd.Component, telemetry telemetry.Component, sysprobeconfig sysprobeconfig.Component, rcclient rcclient.Component, wmeta workloadmeta.Component, tagger tagger.Component, _ healthprobe.Component, settings settings.Component) error { defer StopSystemProbeWithDefaults() - err := startSystemProbe(log, statsd, telemetry, sysprobeconfig, rcclient, wmeta, settings) + err := startSystemProbe(log, statsd, telemetry, sysprobeconfig, rcclient, wmeta, tagger, settings) if err != nil { return err } @@ -273,6 +285,14 @@ func runSystemProbe(ctxChan <-chan context.Context, errChan chan error) error { workloadmetafx.Module(workloadmeta.Params{ AgentType: workloadmeta.Remote, }), + // Provide tagger module + remoteTaggerFx.Module(tagger.RemoteParams{ + RemoteTarget: func(c config.Component) (string, error) { return fmt.Sprintf(":%v", c.GetInt("cmd_port")), nil }, + RemoteTokenFetcher: func(c config.Component) func() (string, error) { + return func() (string, error) { return security.FetchAuthToken(c) } + }, + RemoteFilter: taggerTypes.NewMatchAllFilter(), + }), systemprobeloggerfx.Module(), fx.Provide(func(sysprobeconfig sysprobeconfig.Component) settings.Params { profilingGoRoutines := commonsettings.NewProfilingGoroutines() @@ -300,7 +320,7 @@ func StopSystemProbeWithDefaults() { } // startSystemProbe Initializes the system-probe process -func startSystemProbe(log log.Component, statsd compstatsd.Component, telemetry telemetry.Component, sysprobeconfig sysprobeconfig.Component, _ rcclient.Component, wmeta workloadmeta.Component, settings settings.Component) error { +func startSystemProbe(log log.Component, statsd compstatsd.Component, telemetry telemetry.Component, sysprobeconfig sysprobeconfig.Component, _ rcclient.Component, wmeta workloadmeta.Component, tagger tagger.Component, settings settings.Component) error { var err error cfg := sysprobeconfig.SysProbeObject() @@ -360,7 +380,7 @@ func startSystemProbe(log log.Component, statsd compstatsd.Component, telemetry }() } - if err = api.StartServer(cfg, telemetry, wmeta, settings); err != nil { + if err = api.StartServer(cfg, telemetry, wmeta, tagger, settings); err != nil { return log.Criticalf("error while starting api server, exiting: %v", err) } return nil diff --git a/comp/core/flare/helpers/builder.go b/comp/core/flare/helpers/builder.go index 53ac0c4e12f3e..00277621b5c31 100644 --- a/comp/core/flare/helpers/builder.go +++ b/comp/core/flare/helpers/builder.go @@ -329,6 +329,11 @@ func (fb *builder) copyDirTo(shouldScrub bool, srcDir string, destDir string, sh if err != nil { return fb.logError("error getting absolute path for '%s': %s", srcDir, err) } + + if isLocal := filepath.IsLocal(destDir); !isLocal { + return fb.logError("the destination path is not local to the flare root path: %s", destDir) + } + fb.RegisterFilePerm(srcDir) err = filepath.Walk(srcDir, func(src string, f os.FileInfo, _ error) error { @@ -372,6 +377,10 @@ func (fb *builder) prepareFilePath(path string) (string, error) { return "", errors.New("flare builder is already closed") } + if isLocal := filepath.IsLocal(path); !isLocal { + return "", fb.logError("the destination path is not local to the flare root path: %s", path) + } + p := filepath.Join(fb.flareDir, path) err := os.MkdirAll(filepath.Dir(p), os.ModePerm) diff --git a/comp/core/flare/helpers/builder_test.go b/comp/core/flare/helpers/builder_test.go index fecee10bcbd26..26508575eef92 100644 --- a/comp/core/flare/helpers/builder_test.go +++ b/comp/core/flare/helpers/builder_test.go @@ -136,6 +136,37 @@ func TestAddFile(t *testing.T) { assertFileContent(t, fb, "api_key: \"********\"", "test/AddFile_scrubbed_api_key") } +func TestAddNonLocalFileFlare(t *testing.T) { + fb := getNewBuilder(t) + defer fb.clean() + + expectedError := "the destination path is not local to the flare root path" + + err := fb.AddFile(FromSlash("../test/AddFile"), []byte{}) + assert.ErrorContains(t, err, expectedError) + + err = fb.AddFileWithoutScrubbing(FromSlash("../test/AddFile"), []byte{}) + assert.ErrorContains(t, err, expectedError) + + err = fb.AddFileFromFunc(FromSlash("../test/AddFile"), func() ([]byte, error) { return []byte{}, nil }) + assert.ErrorContains(t, err, expectedError) + + path := filepath.Join(t.TempDir(), "test.data") + os.WriteFile(path, []byte("some data"), os.ModePerm) + err = fb.CopyFileTo(path, FromSlash("../test/AddFile")) + assert.ErrorContains(t, err, expectedError) + + root := setupDirWithData(t) + err = fb.CopyDirTo(root, "../test", func(string) bool { return true }) + assert.ErrorContains(t, err, expectedError) + + err = fb.CopyDirToWithoutScrubbing(root, "../test", func(string) bool { return true }) + assert.ErrorContains(t, err, expectedError) + + _, err = fb.PrepareFilePath("../test") + assert.ErrorContains(t, err, expectedError) +} + func TestAddFileWithoutScrubbing(t *testing.T) { fb := getNewBuilder(t) defer fb.clean() diff --git a/comp/trace/config/config_test.go b/comp/trace/config/config_test.go index 8649b7f142cb2..7bdc5950e9cbf 100644 --- a/comp/trace/config/config_test.go +++ b/comp/trace/config/config_test.go @@ -953,6 +953,19 @@ func TestLoadEnv(t *testing.T) { assert.Equal(t, 12.3, cfg.OTLPReceiver.ProbabilisticSampling) }) + env = "DD_APM_ERROR_TRACKING_STANDALONE_ENABLED" + t.Run(env, func(t *testing.T) { + t.Setenv(env, "true") + + config := buildConfigComponent(t, true, fx.Replace(corecomp.MockParams{ + Params: corecomp.Params{ConfFilePath: "./testdata/undocumented.yaml"}, + })) + cfg := config.Object() + + assert.NotNil(t, cfg) + assert.Equal(t, true, cfg.ErrorTrackingStandalone) + }) + for _, envKey := range []string{ "DD_IGNORE_RESOURCE", // deprecated "DD_APM_IGNORE_RESOURCES", diff --git a/comp/trace/config/setup.go b/comp/trace/config/setup.go index 00a37fffc3315..70b0b3ac6355a 100644 --- a/comp/trace/config/setup.go +++ b/comp/trace/config/setup.go @@ -278,6 +278,10 @@ func applyDatadogConfig(c *config.AgentConfig, core corecompcfg.Component) error c.ProbabilisticSamplerHashSeed = uint32(core.GetInt("apm_config.probabilistic_sampler.hash_seed")) } + if core.IsSet("apm_config.error_tracking_standalone.enabled") { + c.ErrorTrackingStandalone = core.GetBool("apm_config.error_tracking_standalone.enabled") + } + if core.IsSet("apm_config.max_remote_traces_per_second") { c.MaxRemoteTPS = core.GetFloat64("apm_config.max_remote_traces_per_second") } diff --git a/pkg/collector/corechecks/servicediscovery/apm/testutil/instrumented2/.gitignore b/pkg/collector/corechecks/servicediscovery/apm/testutil/instrumented2/.gitignore new file mode 100644 index 0000000000000..68892ef7c3c4f --- /dev/null +++ b/pkg/collector/corechecks/servicediscovery/apm/testutil/instrumented2/.gitignore @@ -0,0 +1,5 @@ +# Ignore Everything +* + +# But go files +!*.go diff --git a/pkg/collector/corechecks/servicediscovery/cp_stub.go b/pkg/collector/corechecks/servicediscovery/cp_stub.go deleted file mode 100644 index a3c26578ae6e8..0000000000000 --- a/pkg/collector/corechecks/servicediscovery/cp_stub.go +++ /dev/null @@ -1,44 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the Apache License Version 2.0. -// This product includes software developed at Datadog (https://www.datadoghq.com/). -// Copyright 2024-present Datadog, Inc. - -//go:build linux && test - -package servicediscovery - -import ( - "time" - - model "github.com/DataDog/agent-payload/v5/process" - - proccontainers "github.com/DataDog/datadog-agent/pkg/process/util/containers" -) - -const ( - dummyContainerID = "abcd" -) - -type containerProviderStub struct { - pidToCid map[int]string -} - -func newContainerProviderStub(targetPIDs []int) proccontainers.ContainerProvider { - pidToCid := make(map[int]string) - - for _, pid := range targetPIDs { - pidToCid[pid] = dummyContainerID - } - - return &containerProviderStub{ - pidToCid: pidToCid, - } -} - -func (*containerProviderStub) GetContainers(_ time.Duration, _ map[string]*proccontainers.ContainerRateMetrics) ([]*model.Container, map[string]*proccontainers.ContainerRateMetrics, map[int]string, error) { - return nil, nil, nil, nil -} - -func (s *containerProviderStub) GetPidToCid(_ time.Duration) map[int]string { - return s.pidToCid -} diff --git a/pkg/collector/corechecks/servicediscovery/impl_linux.go b/pkg/collector/corechecks/servicediscovery/impl_linux.go index a8c737762c249..30f4230425349 100644 --- a/pkg/collector/corechecks/servicediscovery/impl_linux.go +++ b/pkg/collector/corechecks/servicediscovery/impl_linux.go @@ -14,7 +14,6 @@ import ( "github.com/DataDog/datadog-agent/pkg/collector/corechecks/servicediscovery/servicetype" pkgconfigsetup "github.com/DataDog/datadog-agent/pkg/config/setup" processnet "github.com/DataDog/datadog-agent/pkg/process/net" - proccontainers "github.com/DataDog/datadog-agent/pkg/process/util/containers" "github.com/DataDog/datadog-agent/pkg/util/log" ) @@ -28,20 +27,18 @@ type linuxImpl struct { getSysProbeClient processnet.SysProbeUtilGetter time timer - ignoreCfg map[string]bool - containerProvider proccontainers.ContainerProvider + ignoreCfg map[string]bool ignoreProcs map[int]bool aliveServices map[int]*serviceInfo potentialServices map[int]*serviceInfo } -func newLinuxImpl(ignoreCfg map[string]bool, containerProvider proccontainers.ContainerProvider) (osImpl, error) { +func newLinuxImpl(ignoreCfg map[string]bool) (osImpl, error) { return &linuxImpl{ getSysProbeClient: processnet.GetRemoteSystemProbeUtil, time: realTime{}, ignoreCfg: ignoreCfg, - containerProvider: containerProvider, ignoreProcs: make(map[int]bool), aliveServices: make(map[int]*serviceInfo), potentialServices: make(map[int]*serviceInfo), @@ -134,13 +131,6 @@ func (li *linuxImpl) handlePotentialServices(events *serviceEvents, now time.Tim return } - // Get container IDs to enrich the service info with it. The SD check is - // supposed to run once every minute, so we use this duration for cache - // validity. - // TODO: use/find a global constant for this delay, to keep in sync with - // the check delay if it were to change. - containers := li.containerProvider.GetPidToCid(1 * time.Minute) - // potentialServices contains processes that we scanned in the previous // iteration and had open ports. We check if they are still alive in this // iteration, and if so, we send a start-service telemetry event. @@ -150,11 +140,6 @@ func (li *linuxImpl) handlePotentialServices(events *serviceEvents, now time.Tim svc.service.RSS = service.RSS svc.service.CPUCores = service.CPUCores - if id, ok := containers[pid]; ok { - svc.service.ContainerID = id - log.Debugf("[pid: %d] add containerID to process: %s", pid, id) - } - li.aliveServices[pid] = svc events.start = append(events.start, *svc) } diff --git a/pkg/collector/corechecks/servicediscovery/impl_linux_test.go b/pkg/collector/corechecks/servicediscovery/impl_linux_test.go index 762f5dcb264ee..07c7810d7230f 100644 --- a/pkg/collector/corechecks/servicediscovery/impl_linux_test.go +++ b/pkg/collector/corechecks/servicediscovery/impl_linux_test.go @@ -32,6 +32,10 @@ type testProc struct { cwd string } +const ( + dummyContainerID = "abcd" +) + var ( bootTimeMilli = uint64(time.Date(2000, 01, 01, 0, 0, 0, 0, time.UTC).UnixMilli()) procLaunchedMilli = bootTimeMilli + uint64((12 * time.Hour).Milliseconds()) @@ -79,6 +83,7 @@ var ( CPUCores: 1.5, CommandLine: []string{"test-service-1"}, StartTimeMilli: procLaunchedMilli, + ContainerID: dummyContainerID, } portTCP8080UpdatedRSS = model.Service{ PID: procTestService1.pid, @@ -91,6 +96,7 @@ var ( CPUCores: 1.5, CommandLine: []string{"test-service-1"}, StartTimeMilli: procLaunchedMilli, + ContainerID: dummyContainerID, } portTCP8080DifferentPID = model.Service{ PID: procTestService1DifferentPID.pid, @@ -102,6 +108,7 @@ var ( APMInstrumentation: string(apm.Injected), CommandLine: []string{"test-service-1"}, StartTimeMilli: procLaunchedMilli, + ContainerID: dummyContainerID, } portTCP8081 = model.Service{ PID: procIgnoreService1.pid, @@ -109,6 +116,7 @@ var ( GeneratedName: "ignore-1", Ports: []uint16{8081}, StartTimeMilli: procLaunchedMilli, + ContainerID: dummyContainerID, } portTCP5000 = model.Service{ PID: procPythonService.pid, @@ -118,6 +126,7 @@ var ( Ports: []uint16{5000}, CommandLine: pythonCommandLine, StartTimeMilli: procLaunchedMilli, + ContainerID: dummyContainerID, } portTCP5432 = model.Service{ PID: procTestService1Repeat.pid, @@ -126,6 +135,7 @@ var ( Ports: []uint16{5432}, CommandLine: []string{"test-service-1"}, StartTimeMilli: procLaunchedMilli, + ContainerID: dummyContainerID, } ) @@ -172,16 +182,6 @@ func Test_linuxImpl(t *testing.T) { time time.Time } - collectTargetPIDs := func(checkRuns []*checkRun) []int { - targetPIDs := make([]int, 0) - for _, cr := range checkRuns { - for _, service := range cr.servicesResp.Services { - targetPIDs = append(targetPIDs, service.PID) - } - } - return targetPIDs - } - tests := []struct { name string checkRun []*checkRun @@ -567,9 +567,7 @@ func Test_linuxImpl(t *testing.T) { defer ctrl.Finish() // check and mocks setup - targetPIDs := collectTargetPIDs(tc.checkRun) - cpStub := newContainerProviderStub(targetPIDs) - check := newCheck(cpStub) + check := newCheck() mSender := mocksender.NewMockSender(check.ID()) mSender.SetupAcceptAll() diff --git a/pkg/collector/corechecks/servicediscovery/module/comm_test.go b/pkg/collector/corechecks/servicediscovery/module/comm_test.go index 5aab786449896..f7b49d8834486 100644 --- a/pkg/collector/corechecks/servicediscovery/module/comm_test.go +++ b/pkg/collector/corechecks/servicediscovery/module/comm_test.go @@ -30,7 +30,8 @@ const ( // TestIgnoreComm checks that the 'sshd' command is ignored and the 'node' command is not func TestIgnoreComm(t *testing.T) { serverDir := buildFakeServer(t) - url := setupDiscoveryModule(t) + url, mockContainerProvider := setupDiscoveryModule(t) + mockContainerProvider.EXPECT().GetContainers(1*time.Minute, nil).AnyTimes() ctx, cancel := context.WithCancel(context.Background()) t.Cleanup(func() { cancel() }) @@ -58,7 +59,7 @@ func TestIgnoreComm(t *testing.T) { // TestIgnoreCommsLengths checks that the map contains names no longer than 15 bytes. func TestIgnoreCommsLengths(t *testing.T) { - discovery := newDiscovery() + discovery := newDiscovery(nil) require.NotEmpty(t, discovery) require.Equal(t, len(discovery.config.ignoreComms), 10) @@ -114,7 +115,7 @@ func TestShouldIgnoreComm(t *testing.T) { serverBin := buildTestBin(t) serverDir := filepath.Dir(serverBin) - discovery := newDiscovery() + discovery := newDiscovery(nil) require.NotEmpty(t, discovery) require.NotEmpty(t, discovery.config.ignoreComms) require.Equal(t, len(discovery.config.ignoreComms), 10) @@ -208,7 +209,7 @@ func BenchmarkProcName(b *testing.B) { // BenchmarkShouldIgnoreComm benchmarks reading of command name from /proc//comm. func BenchmarkShouldIgnoreComm(b *testing.B) { - discovery := newDiscovery() + discovery := newDiscovery(nil) cmd := startProcessLongComm(b) b.ResetTimer() diff --git a/pkg/collector/corechecks/servicediscovery/module/config_test.go b/pkg/collector/corechecks/servicediscovery/module/config_test.go index fe58eb53b4219..eddf27b1c3253 100644 --- a/pkg/collector/corechecks/servicediscovery/module/config_test.go +++ b/pkg/collector/corechecks/servicediscovery/module/config_test.go @@ -57,7 +57,7 @@ func TestConfigIgnoredComms(t *testing.T) { commsStr := strings.Join(test.comms, " ") // intentionally multiple spaces for sensitivity testing mockSystemProbe.SetWithoutSource("discovery.ignored_command_names", commsStr) - discovery := newDiscovery() + discovery := newDiscovery(nil) require.NotEmpty(t, discovery) require.Equal(t, len(discovery.config.ignoreComms), len(test.comms)) @@ -74,7 +74,7 @@ func TestConfigIgnoredComms(t *testing.T) { t.Run("check default config length", func(t *testing.T) { mock.NewSystemProbe(t) - discovery := newDiscovery() + discovery := newDiscovery(nil) require.NotEmpty(t, discovery) assert.Equal(t, len(discovery.config.ignoreComms), 10) @@ -84,7 +84,7 @@ func TestConfigIgnoredComms(t *testing.T) { mock.NewSystemProbe(t) t.Setenv("DD_DISCOVERY_IGNORED_COMMAND_NAMES", "dummy1 dummy2") - discovery := newDiscovery() + discovery := newDiscovery(nil) require.NotEmpty(t, discovery) _, found := discovery.config.ignoreComms["dummy1"] @@ -120,7 +120,7 @@ func TestConfigIgnoredServices(t *testing.T) { servicesStr := strings.Join(test.services, " ") // intentionally multiple spaces for sensitivity testing mockSystemProbe.SetWithoutSource("discovery.ignored_services", servicesStr) - discovery := newDiscovery() + discovery := newDiscovery(nil) require.NotEmpty(t, discovery) require.Equal(t, len(discovery.config.ignoreServices), len(test.services)) @@ -134,7 +134,7 @@ func TestConfigIgnoredServices(t *testing.T) { t.Run("check default number of services", func(t *testing.T) { mock.NewSystemProbe(t) - discovery := newDiscovery() + discovery := newDiscovery(nil) require.NotEmpty(t, discovery) assert.Equal(t, len(discovery.config.ignoreServices), 6) @@ -144,7 +144,7 @@ func TestConfigIgnoredServices(t *testing.T) { mock.NewSystemProbe(t) t.Setenv("DD_DISCOVERY_IGNORED_SERVICES", "service1 service2") - discovery := newDiscovery() + discovery := newDiscovery(nil) require.NotEmpty(t, discovery) _, found := discovery.config.ignoreServices["service1"] diff --git a/pkg/collector/corechecks/servicediscovery/module/ignore_proc_test.go b/pkg/collector/corechecks/servicediscovery/module/ignore_proc_test.go index b33486cfa1b90..90d0dc9d8dcb3 100644 --- a/pkg/collector/corechecks/servicediscovery/module/ignore_proc_test.go +++ b/pkg/collector/corechecks/servicediscovery/module/ignore_proc_test.go @@ -66,7 +66,7 @@ func TestShouldIgnorePid(t *testing.T) { _ = cmd.Process.Kill() }) - discovery := newDiscovery() + discovery := newDiscovery(nil) require.NotEmpty(t, discovery) proc, err := customNewProcess(int32(cmd.Process.Pid)) diff --git a/pkg/collector/corechecks/servicediscovery/module/impl_linux.go b/pkg/collector/corechecks/servicediscovery/module/impl_linux.go index 302bd089499e2..70cee8a18beee 100644 --- a/pkg/collector/corechecks/servicediscovery/module/impl_linux.go +++ b/pkg/collector/corechecks/servicediscovery/module/impl_linux.go @@ -33,6 +33,7 @@ import ( "github.com/DataDog/datadog-agent/pkg/languagedetection/privileged" "github.com/DataDog/datadog-agent/pkg/network" "github.com/DataDog/datadog-agent/pkg/process/procutil" + proccontainers "github.com/DataDog/datadog-agent/pkg/process/util/containers" "github.com/DataDog/datadog-agent/pkg/util/kernel" "github.com/DataDog/datadog-agent/pkg/util/log" ) @@ -81,9 +82,11 @@ type discovery struct { // lastCPUTimeUpdate is the last time lastGlobalCPUTime was updated. lastCPUTimeUpdate time.Time + + containerProvider proccontainers.ContainerProvider } -func newDiscovery() *discovery { +func newDiscovery(containerProvider proccontainers.ContainerProvider) *discovery { return &discovery{ config: newConfig(), mux: &sync.RWMutex{}, @@ -91,12 +94,14 @@ func newDiscovery() *discovery { ignorePids: make(map[int32]struct{}), privilegedDetector: privileged.NewLanguageDetector(), scrubber: procutil.NewDefaultDataScrubber(), + containerProvider: containerProvider, } } // NewDiscoveryModule creates a new discovery system probe module. -func NewDiscoveryModule(*sysconfigtypes.Config, module.FactoryDependencies) (module.Module, error) { - return newDiscovery(), nil +func NewDiscoveryModule(_ *sysconfigtypes.Config, deps module.FactoryDependencies) (module.Module, error) { + sharedContainerProvider := proccontainers.InitSharedContainerProvider(deps.WMeta, deps.Tagger) + return newDiscovery(sharedContainerProvider), nil } // GetStats returns the stats of the discovery module. @@ -623,6 +628,10 @@ func (s *discovery) getServices() (*[]model.Service, error) { var services []model.Service alivePids := make(map[int32]struct{}, len(pids)) + _, _, pidToCid, err := s.containerProvider.GetContainers(1*time.Minute, nil) + if err != nil { + log.Errorf("could not get containers: %s", err) + } for _, pid := range pids { alivePids[pid] = struct{}{} @@ -632,6 +641,10 @@ func (s *discovery) getServices() (*[]model.Service, error) { continue } + if id, ok := pidToCid[service.PID]; ok { + service.ContainerID = id + } + services = append(services, *service) } diff --git a/pkg/collector/corechecks/servicediscovery/module/impl_linux_test.go b/pkg/collector/corechecks/servicediscovery/module/impl_linux_test.go index bd71b5d00e6b5..6d5610c952a24 100644 --- a/pkg/collector/corechecks/servicediscovery/module/impl_linux_test.go +++ b/pkg/collector/corechecks/servicediscovery/module/impl_linux_test.go @@ -28,6 +28,7 @@ import ( "testing" "time" + "github.com/golang/mock/gomock" gorillamux "github.com/gorilla/mux" "github.com/prometheus/procfs" "github.com/shirou/gopsutil/v3/process" @@ -39,6 +40,7 @@ import ( "github.com/DataDog/datadog-agent/cmd/system-probe/config" "github.com/DataDog/datadog-agent/cmd/system-probe/config/types" "github.com/DataDog/datadog-agent/comp/core" + taggermock "github.com/DataDog/datadog-agent/comp/core/tagger/mock" workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def" wmmock "github.com/DataDog/datadog-agent/comp/core/workloadmeta/fx-mock" "github.com/DataDog/datadog-agent/pkg/collector/corechecks/servicediscovery/apm" @@ -49,18 +51,24 @@ import ( "github.com/DataDog/datadog-agent/pkg/network/protocols/tls/nodejs" fileopener "github.com/DataDog/datadog-agent/pkg/network/usm/sharedlibraries/testutil" usmtestutil "github.com/DataDog/datadog-agent/pkg/network/usm/testutil" + proccontainersmocks "github.com/DataDog/datadog-agent/pkg/process/util/containers/mocks" "github.com/DataDog/datadog-agent/pkg/util/fxutil" "github.com/DataDog/datadog-agent/pkg/util/kernel" dockerutils "github.com/DataDog/datadog-agent/pkg/util/testutil/docker" ) -func setupDiscoveryModule(t *testing.T) string { +func setupDiscoveryModule(t *testing.T) (string, *proccontainersmocks.MockContainerProvider) { t.Helper() wmeta := fxutil.Test[workloadmeta.Component](t, core.MockBundle(), wmmock.MockModule(workloadmeta.NewParams()), ) + + tagger := taggermock.SetupFakeTagger(t) + mockCtrl := gomock.NewController(t) + mockContainerProvider := proccontainersmocks.NewMockContainerProvider(mockCtrl) + mux := gorillamux.NewRouter() cfg := &types.Config{ Enabled: true, @@ -71,25 +79,22 @@ func setupDiscoveryModule(t *testing.T) string { m := module.Factory{ Name: config.DiscoveryModule, ConfigNamespaces: []string{"discovery"}, - Fn: func(cfg *types.Config, deps module.FactoryDependencies) (module.Module, error) { - module, err := NewDiscoveryModule(cfg, deps) - if err != nil { - return nil, err - } + Fn: func(*types.Config, module.FactoryDependencies) (module.Module, error) { + module := newDiscovery(mockContainerProvider) + module.config.cpuUsageUpdateDelay = time.Second - module.(*discovery).config.cpuUsageUpdateDelay = time.Second return module, nil }, NeedsEBPF: func() bool { return false }, } - err := module.Register(cfg, mux, []module.Factory{m}, wmeta, nil) + err := module.Register(cfg, mux, []module.Factory{m}, wmeta, tagger, nil) require.NoError(t, err) srv := httptest.NewServer(mux) t.Cleanup(srv.Close) - return srv.URL + return srv.URL, mockContainerProvider } func getServices(t *testing.T, url string) []model.Service { @@ -191,7 +196,8 @@ func startProcessWithFile(t *testing.T, f *os.File) *exec.Cmd { // Check that we get (only) listening processes for all expected protocols. func TestBasic(t *testing.T) { - url := setupDiscoveryModule(t) + url, mockContainerProvider := setupDiscoveryModule(t) + mockContainerProvider.EXPECT().GetContainers(1*time.Minute, nil).AnyTimes() var expectedPIDs []int var unexpectedPIDs []int @@ -244,7 +250,8 @@ func TestBasic(t *testing.T) { // Check that we get all listening ports for a process func TestPorts(t *testing.T) { - url := setupDiscoveryModule(t) + url, mockContainerProvider := setupDiscoveryModule(t) + mockContainerProvider.EXPECT().GetContainers(1*time.Minute, nil).AnyTimes() var expectedPorts []uint16 var unexpectedPorts []uint16 @@ -290,7 +297,8 @@ func TestPorts(t *testing.T) { } func TestPortsLimits(t *testing.T) { - url := setupDiscoveryModule(t) + url, mockContainerProvider := setupDiscoveryModule(t) + mockContainerProvider.EXPECT().GetContainers(1*time.Minute, nil).AnyTimes() var expectedPorts []int @@ -324,7 +332,8 @@ func TestPortsLimits(t *testing.T) { } func TestServiceName(t *testing.T) { - url := setupDiscoveryModule(t) + url, mockContainerProvider := setupDiscoveryModule(t) + mockContainerProvider.EXPECT().GetContainers(1*time.Minute, nil).AnyTimes() listener, err := net.Listen("tcp", "") require.NoError(t, err) @@ -358,11 +367,13 @@ func TestServiceName(t *testing.T) { assert.Equal(t, portMap[pid].DDService, portMap[pid].Name) assert.Equal(t, "sleep", portMap[pid].GeneratedName) assert.False(t, portMap[pid].DDServiceInjected) + assert.Equal(t, portMap[pid].ContainerID, "") }, 30*time.Second, 100*time.Millisecond) } func TestInjectedServiceName(t *testing.T) { - url := setupDiscoveryModule(t) + url, mockContainerProvider := setupDiscoveryModule(t) + mockContainerProvider.EXPECT().GetContainers(1*time.Minute, nil).AnyTimes() createEnvsMemfd(t, []string{ "OTHER_ENV=test", @@ -388,7 +399,8 @@ func TestInjectedServiceName(t *testing.T) { } func TestAPMInstrumentationInjected(t *testing.T) { - url := setupDiscoveryModule(t) + url, mockContainerProvider := setupDiscoveryModule(t) + mockContainerProvider.EXPECT().GetContainers(1*time.Minute, nil).AnyTimes() createEnvsMemfd(t, []string{ "DD_INJECTION_ENABLED=service_name,tracer", @@ -484,7 +496,8 @@ func testCaptureWrappedCommands(t *testing.T, script string, commandWrapper []st } t.Cleanup(func() { _ = proc.Kill() }) - url := setupDiscoveryModule(t) + url, mockContainerProvider := setupDiscoveryModule(t) + mockContainerProvider.EXPECT().GetContainers(1*time.Minute, nil).AnyTimes() pid := int(proc.Pid) require.EventuallyWithT(t, func(collect *assert.CollectT) { svcMap := getServicesMap(t, url) @@ -524,7 +537,8 @@ func TestAPMInstrumentationProvided(t *testing.T) { } serverDir := buildFakeServer(t) - url := setupDiscoveryModule(t) + url, mockContainerProvider := setupDiscoveryModule(t) + mockContainerProvider.EXPECT().GetContainers(1*time.Minute, nil).AnyTimes() for name, test := range testCases { t.Run(name, func(t *testing.T) { @@ -605,7 +619,8 @@ func assertCPU(t *testing.T, url string, pid int) { func TestCommandLineSanitization(t *testing.T) { serverDir := buildFakeServer(t) - url := setupDiscoveryModule(t) + url, mockContainerProvider := setupDiscoveryModule(t) + mockContainerProvider.EXPECT().GetContainers(1*time.Minute, nil).AnyTimes() ctx, cancel := context.WithCancel(context.Background()) t.Cleanup(func() { cancel() }) @@ -636,7 +651,8 @@ func TestNodeDocker(t *testing.T) { nodeJSPID, err := nodejs.GetNodeJSDockerPID() require.NoError(t, err) - url := setupDiscoveryModule(t) + url, mockContainerProvider := setupDiscoveryModule(t) + mockContainerProvider.EXPECT().GetContainers(1*time.Minute, nil).AnyTimes() pid := int(nodeJSPID) require.EventuallyWithT(t, func(collect *assert.CollectT) { @@ -693,7 +709,8 @@ func TestAPMInstrumentationProvidedWithMaps(t *testing.T) { cmd, err := fileopener.OpenFromProcess(t, fake, test.lib) require.NoError(t, err) - url := setupDiscoveryModule(t) + url, mockContainerProvider := setupDiscoveryModule(t) + mockContainerProvider.EXPECT().GetContainers(1*time.Minute, nil).AnyTimes() pid := cmd.Process.Pid require.EventuallyWithT(t, func(collect *assert.CollectT) { @@ -709,7 +726,8 @@ func TestAPMInstrumentationProvidedWithMaps(t *testing.T) { // Check that we can get listening processes in other namespaces. func TestNamespaces(t *testing.T) { - url := setupDiscoveryModule(t) + url, mockContainerProvider := setupDiscoveryModule(t) + mockContainerProvider.EXPECT().GetContainers(1*time.Minute, nil).AnyTimes() // Needed when changing namespaces runtime.LockOSThread() @@ -769,7 +787,7 @@ func TestNamespaces(t *testing.T) { // Check that we are able to find services inside Docker containers. func TestDocker(t *testing.T) { - url := setupDiscoveryModule(t) + url, mockContainerProvider := setupDiscoveryModule(t) dir, _ := testutil.CurDir() dockerCfg := dockerutils.NewComposeConfig("foo-server", @@ -794,6 +812,10 @@ func TestDocker(t *testing.T) { } if comm == "python-1111" { pid1111 = process.PID + mockContainerProvider.EXPECT().GetContainers(1*time.Minute, nil).Return(nil, nil, map[int]string{ + pid1111: "dummyCID", + }, nil) + break } } @@ -804,20 +826,17 @@ func TestDocker(t *testing.T) { require.Contains(t, portMap, pid1111) require.Contains(t, portMap[pid1111].Ports, uint16(1234)) + require.Contains(t, portMap[pid1111].ContainerID, "dummyCID") } // Check that the cache is cleaned when procceses die. func TestCache(t *testing.T) { - wmeta := fxutil.Test[workloadmeta.Component](t, - core.MockBundle(), - wmmock.MockModule(workloadmeta.NewParams()), - ) - deps := module.FactoryDependencies{ - WMeta: wmeta, - } - module, err := NewDiscoveryModule(nil, deps) - require.NoError(t, err) - discovery := module.(*discovery) + var err error + + mockCtrl := gomock.NewController(t) + mockContainerProvider := proccontainersmocks.NewMockContainerProvider(mockCtrl) + mockContainerProvider.EXPECT().GetContainers(1*time.Minute, nil).MinTimes(1) + discovery := newDiscovery(mockContainerProvider) ctx, cancel := context.WithCancel(context.Background()) t.Cleanup(func() { cancel() }) diff --git a/pkg/collector/corechecks/servicediscovery/servicediscovery.go b/pkg/collector/corechecks/servicediscovery/servicediscovery.go index 5400e33272bd2..6f96ccc775bb5 100644 --- a/pkg/collector/corechecks/servicediscovery/servicediscovery.go +++ b/pkg/collector/corechecks/servicediscovery/servicediscovery.go @@ -20,7 +20,6 @@ import ( "github.com/DataDog/datadog-agent/pkg/collector/corechecks" "github.com/DataDog/datadog-agent/pkg/collector/corechecks/servicediscovery/model" pkgconfigsetup "github.com/DataDog/datadog-agent/pkg/config/setup" - proccontainers "github.com/DataDog/datadog-agent/pkg/process/util/containers" "github.com/DataDog/datadog-agent/pkg/util/log" "github.com/DataDog/datadog-agent/pkg/util/optional" ) @@ -59,7 +58,7 @@ type osImpl interface { DiscoverServices() (*discoveredServices, error) } -var newOSImpl func(ignoreCfg map[string]bool, containerProvider proccontainers.ContainerProvider) (osImpl, error) +var newOSImpl func(ignoreCfg map[string]bool) (osImpl, error) type config struct { IgnoreProcesses []string `yaml:"ignore_processes"` @@ -80,7 +79,6 @@ type Check struct { os osImpl sender *telemetrySender sentRepeatedEventPIDs map[int]bool - containerProvider proccontainers.ContainerProvider } // Factory creates a new check factory @@ -91,24 +89,17 @@ func Factory() optional.Option[func() check.Check] { return optional.NewNoneOption[func() check.Check]() } - sharedContainerProvider, err := proccontainers.GetSharedContainerProvider() - - if err != nil { - return optional.NewNoneOption[func() check.Check]() - } - return optional.NewOption(func() check.Check { - return newCheck(sharedContainerProvider) + return newCheck() }) } // TODO: add metastore param -func newCheck(containerProvider proccontainers.ContainerProvider) *Check { +func newCheck() *Check { return &Check{ CheckBase: corechecks.NewCheckBase(CheckName), cfg: &config{}, sentRepeatedEventPIDs: make(map[int]bool), - containerProvider: containerProvider, } } @@ -135,7 +126,7 @@ func (c *Check) Configure(senderManager sender.SenderManager, _ uint64, instance } c.sender = newTelemetrySender(s) - c.os, err = newOSImpl(ignoreCfg, c.containerProvider) + c.os, err = newOSImpl(ignoreCfg) if err != nil { return err } diff --git a/pkg/config/config_template.yaml b/pkg/config/config_template.yaml index 8d1afed4b8267..a858e50ce8b7f 100644 --- a/pkg/config/config_template.yaml +++ b/pkg/config/config_template.yaml @@ -1462,6 +1462,16 @@ api_key: ## collectors using the probabilistic sampler to ensure consistent sampling. # hash_seed: 0 + ## @param error_tracking_standalone - object - optional + ## Enables Error Tracking Standalone + ## + #error_tracking_standalone: + # + ## @param enabled - boolean - optional - default: false + ## @env DD_APM_ERROR_TRACKING_STANDALONE_ENABLED - boolean - optional - default: false + ## Enables or disables Error Tracking Standalone + # enabled: false + {{- if .InternalProfiling -}} ## @param profiling - custom object - optional diff --git a/pkg/config/setup/apm.go b/pkg/config/setup/apm.go index 420403407a353..f792f9c57dd81 100644 --- a/pkg/config/setup/apm.go +++ b/pkg/config/setup/apm.go @@ -112,6 +112,7 @@ func setupAPM(config pkgconfigmodel.Setup) { config.BindEnv("apm_config.probabilistic_sampler.enabled", "DD_APM_PROBABILISTIC_SAMPLER_ENABLED") config.BindEnv("apm_config.probabilistic_sampler.sampling_percentage", "DD_APM_PROBABILISTIC_SAMPLER_SAMPLING_PERCENTAGE") config.BindEnv("apm_config.probabilistic_sampler.hash_seed", "DD_APM_PROBABILISTIC_SAMPLER_HASH_SEED") + config.BindEnvAndSetDefault("apm_config.error_tracking_standalone.enabled", false, "DD_APM_ERROR_TRACKING_STANDALONE_ENABLED") config.BindEnv("apm_config.max_memory", "DD_APM_MAX_MEMORY") config.BindEnv("apm_config.max_cpu_percent", "DD_APM_MAX_CPU_PERCENT") diff --git a/pkg/trace/agent/agent.go b/pkg/trace/agent/agent.go index a10e015be49f5..ff2307a3aeb49 100644 --- a/pkg/trace/agent/agent.go +++ b/pkg/trace/agent/agent.go @@ -568,7 +568,8 @@ func (a *Agent) ProcessStats(in *pb.ClientStatsPayload, lang, tracerVersion, con a.ClientStatsAggregator.In <- a.processStats(in, lang, tracerVersion, containerID) } -// sample performs all sampling on the processedTrace modifying it as needed and returning if the trace should be kept and the number of events in the trace +// sample performs all sampling on the processedTrace modifying it as needed and returning if the trace should be kept +// and the number of events in the trace func (a *Agent) sample(now time.Time, ts *info.TagStats, pt *traceutil.ProcessedTrace) (keep bool, numEvents int) { // We have a `keep` that is different from pt's `DroppedTrace` field as `DroppedTrace` will be sent to intake. // For example: We want to maintain the overall trace level sampling decision for a trace with Analytics Events @@ -579,7 +580,7 @@ func (a *Agent) sample(now time.Time, ts *info.TagStats, pt *traceutil.Processed if checkAnalyticsEvents { events = a.getAnalyzedEvents(pt, ts) } - if !keep { + if !keep && !a.conf.ErrorTrackingStandalone { modified := sampler.SingleSpanSampling(pt) if !modified { // If there were no sampled spans, and we're not keeping the trace, let's use the analytics events @@ -630,13 +631,23 @@ func (a *Agent) getAnalyzedEvents(pt *traceutil.ProcessedTrace, ts *info.TagStat // runSamplers runs the agent's configured samplers on pt and returns the sampling decision along // with the sampling rate. // -// The rare sampler is run first, catching all rare traces early. If the probabilistic sampler is +// If the agent is set as Error Tracking Standalone, only the ErrorSampler is run (other samplers are bypassed). +// Otherwise, the rare sampler is run first, catching all rare traces early. If the probabilistic sampler is // enabled, it is run on the trace, followed by the error sampler. Otherwise, If the trace has a // priority set, the sampling priority is used with the Priority Sampler. When there is no priority // set, the NoPrioritySampler is run. Finally, if the trace has not been sampled by the other // samplers, the error sampler is run. func (a *Agent) runSamplers(now time.Time, ts *info.TagStats, pt traceutil.ProcessedTrace) (keep bool, checkAnalyticsEvents bool) { - // run this early to make sure the signature gets counted by the RareSampler. + // ETS: chunks that don't contain errors (or spans with exception span events) are all dropped. + if a.conf.ErrorTrackingStandalone { + if traceContainsError(pt.TraceChunk.Spans, true) { + pt.TraceChunk.Tags["_dd.error_tracking_standalone.error"] = "true" + return a.ErrorsSampler.Sample(now, pt.TraceChunk.Spans, pt.Root, pt.TracerEnv), false + } + return false, false + } + + // Run this early to make sure the signature gets counted by the RareSampler. rare := a.RareSampler.Sample(now, pt.TraceChunk, pt.TracerEnv) if a.conf.ProbabilisticSamplerEnabled { @@ -647,7 +658,7 @@ func (a *Agent) runSamplers(now time.Time, ts *info.TagStats, pt traceutil.Proce pt.TraceChunk.Tags[tagDecisionMaker] = probabilitySampling return true, true } - if traceContainsError(pt.TraceChunk.Spans) { + if traceContainsError(pt.TraceChunk.Spans, false) { return a.ErrorsSampler.Sample(now, pt.TraceChunk.Spans, pt.Root, pt.TracerEnv), true } return false, true @@ -684,22 +695,29 @@ func (a *Agent) runSamplers(now time.Time, ts *info.TagStats, pt traceutil.Proce return true, true } - if traceContainsError(pt.TraceChunk.Spans) { + if traceContainsError(pt.TraceChunk.Spans, false) { return a.ErrorsSampler.Sample(now, pt.TraceChunk.Spans, pt.Root, pt.TracerEnv), true } return false, true } -func traceContainsError(trace pb.Trace) bool { +func traceContainsError(trace pb.Trace, considerExceptionEvents bool) bool { for _, span := range trace { - if span.Error != 0 { + if span.Error != 0 || (considerExceptionEvents && spanContainsExceptionSpanEvent(span)) { return true } } return false } +func spanContainsExceptionSpanEvent(span *pb.Span) bool { + if hasExceptionSpanEvents, ok := span.Meta["_dd.span_events.has_exception"]; ok && hasExceptionSpanEvents == "true" { + return true + } + return false +} + func filteredByTags(root *pb.Span, require, reject []*config.Tag, requireRegex, rejectRegex []*config.TagRegex) bool { for _, tag := range reject { if v, ok := root.Meta[tag.K]; ok && (tag.V == "" || v == tag.V) { diff --git a/pkg/trace/agent/agent_test.go b/pkg/trace/agent/agent_test.go index 302f3c5d03606..78cd9f8e0a9c9 100644 --- a/pkg/trace/agent/agent_test.go +++ b/pkg/trace/agent/agent_test.go @@ -1273,7 +1273,7 @@ func TestSampling(t *testing.T) { } } -func TestSample(t *testing.T) { +func TestSampleTrace(t *testing.T) { now := time.Now() cfg := &config.AgentConfig{TargetTPS: 5, ErrorTPS: 1000, Features: make(map[string]struct{})} genSpan := func(decisionMaker string, priority sampler.SamplingPriority, err int32) traceutil.ProcessedTrace { @@ -1357,6 +1357,133 @@ func TestSample(t *testing.T) { } } +func TestSample(t *testing.T) { + now := time.Now() + cfg := &config.AgentConfig{TargetTPS: 5, ErrorTPS: 1000, Features: make(map[string]struct{})} + genSpan := func(decisionMaker string, priority sampler.SamplingPriority, err int32, exceptionInSpanEvent bool) traceutil.ProcessedTrace { + root := &pb.Span{ + Service: "serv1", + Start: now.UnixNano(), + Duration: (100 * time.Millisecond).Nanoseconds(), + Metrics: map[string]float64{"_top_level": 1}, + Error: err, // If 1, the Error Sampler will keep the trace, if 0, it will not be sampled + Meta: map[string]string{}, + } + if exceptionInSpanEvent { + root.Meta["_dd.span_events.has_exception"] = "true" // the Error Sampler will keep the trace + } + chunk := testutil.TraceChunkWithSpan(root) + if decisionMaker != "" { + chunk.Tags["_dd.p.dm"] = decisionMaker + } + pt := traceutil.ProcessedTrace{TraceChunk: chunk, Root: root} + pt.TraceChunk.Priority = int32(priority) + return pt + } + statsd := &statsd.NoOpClient{} + tests := map[string]struct { + trace traceutil.ProcessedTrace + etsEnabled bool + keep bool + keepWithFeature bool + }{ + "userdrop-error-manual-dm-unsampled": { + trace: genSpan("-4", sampler.PriorityUserDrop, 1, false), + keep: false, + keepWithFeature: false, + }, + "userkeep-error-no-dm-sampled": { + trace: genSpan("", sampler.PriorityUserKeep, 1, false), + keep: true, + keepWithFeature: true, + }, + "userkeep-error-agent-dm-sampled": { + trace: genSpan("-1", sampler.PriorityUserKeep, 1, false), + keep: true, + keepWithFeature: true, + }, + "autodrop-error-sampled": { + trace: genSpan("", sampler.PriorityAutoDrop, 1, false), + keep: true, + keepWithFeature: true, + }, + "autodrop-not-sampled": { + trace: genSpan("", sampler.PriorityAutoDrop, 0, false), + keep: false, + keepWithFeature: false, + }, + "ets-userdrop-error-manual-dm-unsampled": { + trace: genSpan("-4", sampler.PriorityUserDrop, 1, false), + etsEnabled: true, + keep: true, + keepWithFeature: true, + }, + "ets-userdrop-errorspanevent-manual-dm-unsampled": { + trace: genSpan("-4", sampler.PriorityUserDrop, 1, false), + etsEnabled: true, + keep: true, + keepWithFeature: true, + }, + "ets-userdrop-manual-dm-unsampled": { + trace: genSpan("-4", sampler.PriorityUserDrop, 0, false), + etsEnabled: true, + keep: false, + keepWithFeature: false, + }, + "ets-userkeep-error-no-dm-sampled": { + trace: genSpan("", sampler.PriorityUserKeep, 1, false), + etsEnabled: true, + keep: true, + keepWithFeature: true, + }, + "ets-userkeep-error-agent-dm-sampled": { + trace: genSpan("-1", sampler.PriorityUserKeep, 1, false), + etsEnabled: true, + keep: true, + keepWithFeature: true, + }, + "ets-autodrop-error-sampled": { + trace: genSpan("", sampler.PriorityAutoDrop, 1, false), + etsEnabled: true, + keep: true, + keepWithFeature: true, + }, + "ets-autodrop-errorspanevent-sampled": { + trace: genSpan("", sampler.PriorityAutoDrop, 0, true), + etsEnabled: true, + keep: true, + keepWithFeature: true, + }, + "ets-autodrop-not-sampled": { + trace: genSpan("", sampler.PriorityAutoDrop, 0, false), + etsEnabled: true, + keep: false, + keepWithFeature: false, + }, + } + for name, tt := range tests { + cfg.ErrorTrackingStandalone = tt.etsEnabled + a := &Agent{ + NoPrioritySampler: sampler.NewNoPrioritySampler(cfg, statsd), + ErrorsSampler: sampler.NewErrorsSampler(cfg, statsd), + PrioritySampler: sampler.NewPrioritySampler(cfg, &sampler.DynamicConfig{}, statsd), + RareSampler: sampler.NewRareSampler(config.New(), statsd), + EventProcessor: newEventProcessor(cfg, statsd), + conf: cfg, + } + t.Run(name, func(t *testing.T) { + keep, _ := a.sample(now, info.NewReceiverStats().GetTagStats(info.Tags{}), &tt.trace) + assert.Equal(t, tt.keep, keep) + assert.Equal(t, !tt.keep, tt.trace.TraceChunk.DroppedTrace) + cfg.Features["error_rare_sample_tracer_drop"] = struct{}{} + defer delete(cfg.Features, "error_rare_sample_tracer_drop") + keep, _ = a.sample(now, info.NewReceiverStats().GetTagStats(info.Tags{}), &tt.trace) + assert.Equal(t, tt.keepWithFeature, keep) + assert.Equal(t, !tt.keepWithFeature, tt.trace.TraceChunk.DroppedTrace) + }) + } +} + func TestSampleManualUserDropNoAnalyticsEvents(t *testing.T) { // This test exists to confirm previous behavior where we did not extract nor tag analytics events on // user manual drop traces diff --git a/pkg/trace/api/otlp.go b/pkg/trace/api/otlp.go index 9dda5532f1338..7272df819a8e0 100644 --- a/pkg/trace/api/otlp.go +++ b/pkg/trace/api/otlp.go @@ -549,6 +549,7 @@ func (o *OTLPReceiver) convertSpan(rattr map[string]string, lib pcommon.Instrume if in.Events().Len() > 0 { transform.SetMetaOTLP(span, "events", transform.MarshalEvents(in.Events())) } + transform.TagSpanIfContainsExceptionEvent(in, span) if in.Links().Len() > 0 { transform.SetMetaOTLP(span, "_dd.span_links", transform.MarshalLinks(in.Links())) } diff --git a/pkg/trace/api/otlp_test.go b/pkg/trace/api/otlp_test.go index b3717ec09d990..bd6377ff4612d 100644 --- a/pkg/trace/api/otlp_test.go +++ b/pkg/trace/api/otlp_test.go @@ -1702,22 +1702,23 @@ func testOTelSpanToDDSpan(enableOperationAndResourceNameV2 bool, t *testing.T) { Duration: 200000000, Error: 1, Meta: map[string]string{ - "name": "john", - "otel.trace_id": "72df520af2bde7a5240031ead750e5f3", - "env": "staging", - "otel.status_code": "Error", - "otel.status_description": "Error", - "otel.library.name": "ddtracer", - "otel.library.version": "v2", - "service.version": "v1.2.3", - "w3c.tracestate": "state", - "version": "v1.2.3", - "events": `[{"time_unix_nano":123,"name":"boom","attributes":{"key":"Out of memory","accuracy":2.4},"dropped_attributes_count":2},{"time_unix_nano":456,"name":"exception","attributes":{"exception.message":"Out of memory","exception.type":"mem","exception.stacktrace":"1/2/3"},"dropped_attributes_count":2}]`, - "_dd.span_links": `[{"trace_id":"fedcba98765432100123456789abcdef","span_id":"abcdef0123456789","trace_state":"dd=asdf256,ee=jkl;128", "attributes":{"a1":"v1","a2":"v2"},"dropped_attributes_count":24},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","attributes":{"a3":"v2","a4":"v4"}},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","dropped_attributes_count":2},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210"}]`, - "error.msg": "Out of memory", - "error.type": "mem", - "error.stack": "1/2/3", - "span.kind": "server", + "name": "john", + "otel.trace_id": "72df520af2bde7a5240031ead750e5f3", + "env": "staging", + "otel.status_code": "Error", + "otel.status_description": "Error", + "otel.library.name": "ddtracer", + "otel.library.version": "v2", + "service.version": "v1.2.3", + "w3c.tracestate": "state", + "version": "v1.2.3", + "events": `[{"time_unix_nano":123,"name":"boom","attributes":{"key":"Out of memory","accuracy":2.4},"dropped_attributes_count":2},{"time_unix_nano":456,"name":"exception","attributes":{"exception.message":"Out of memory","exception.type":"mem","exception.stacktrace":"1/2/3"},"dropped_attributes_count":2}]`, + "_dd.span_links": `[{"trace_id":"fedcba98765432100123456789abcdef","span_id":"abcdef0123456789","trace_state":"dd=asdf256,ee=jkl;128", "attributes":{"a1":"v1","a2":"v2"},"dropped_attributes_count":24},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","attributes":{"a3":"v2","a4":"v4"}},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","dropped_attributes_count":2},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210"}]`, + "error.msg": "Out of memory", + "error.type": "mem", + "error.stack": "1/2/3", + "span.kind": "server", + "_dd.span_events.has_exception": "true", }, Metrics: map[string]float64{ "approx": 1.2, @@ -1829,25 +1830,26 @@ func testOTelSpanToDDSpan(enableOperationAndResourceNameV2 bool, t *testing.T) { Duration: 200000000, Error: 1, Meta: map[string]string{ - "name": "john", - "env": "", - "otel.trace_id": "72df520af2bde7a5240031ead750e5f3", - "otel.status_code": "Error", - "otel.status_description": "Error", - "otel.library.name": "ddtracer", - "otel.library.version": "v2", - "service.version": "v1.2.3", - "w3c.tracestate": "state", - "version": "v1.2.3", - "events": "[{\"time_unix_nano\":123,\"name\":\"boom\",\"attributes\":{\"message\":\"Out of memory\",\"accuracy\":2.4},\"dropped_attributes_count\":2},{\"time_unix_nano\":456,\"name\":\"exception\",\"attributes\":{\"exception.message\":\"Out of memory\",\"exception.type\":\"mem\",\"exception.stacktrace\":\"1/2/3\"},\"dropped_attributes_count\":2}]", - "_dd.span_links": `[{"trace_id":"fedcba98765432100123456789abcdef","span_id":"abcdef0123456789","trace_state":"dd=asdf256,ee=jkl;128","attributes":{"a1":"v1","a2":"v2"},"dropped_attributes_count":24},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","attributes":{"a3":"v2","a4":"v4"}},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","dropped_attributes_count":2},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210"}]`, - "error.msg": "Out of memory", - "error.type": "mem", - "error.stack": "1/2/3", - "http.method": "GET", - "http.route": "/path", - "peer.service": "userbase", - "span.kind": "server", + "name": "john", + "env": "", + "otel.trace_id": "72df520af2bde7a5240031ead750e5f3", + "otel.status_code": "Error", + "otel.status_description": "Error", + "otel.library.name": "ddtracer", + "otel.library.version": "v2", + "service.version": "v1.2.3", + "w3c.tracestate": "state", + "version": "v1.2.3", + "events": "[{\"time_unix_nano\":123,\"name\":\"boom\",\"attributes\":{\"message\":\"Out of memory\",\"accuracy\":2.4},\"dropped_attributes_count\":2},{\"time_unix_nano\":456,\"name\":\"exception\",\"attributes\":{\"exception.message\":\"Out of memory\",\"exception.type\":\"mem\",\"exception.stacktrace\":\"1/2/3\"},\"dropped_attributes_count\":2}]", + "_dd.span_links": `[{"trace_id":"fedcba98765432100123456789abcdef","span_id":"abcdef0123456789","trace_state":"dd=asdf256,ee=jkl;128","attributes":{"a1":"v1","a2":"v2"},"dropped_attributes_count":24},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","attributes":{"a3":"v2","a4":"v4"}},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","dropped_attributes_count":2},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210"}]`, + "error.msg": "Out of memory", + "error.type": "mem", + "error.stack": "1/2/3", + "http.method": "GET", + "http.route": "/path", + "peer.service": "userbase", + "span.kind": "server", + "_dd.span_events.has_exception": "true", }, Metrics: map[string]float64{ "approx": 1.2, @@ -1958,24 +1960,25 @@ func testOTelSpanToDDSpan(enableOperationAndResourceNameV2 bool, t *testing.T) { Duration: 200000000, Error: 1, Meta: map[string]string{ - "name": "john", - "env": "staging", - "otel.status_code": "Error", - "otel.status_description": "Error", - "otel.library.name": "ddtracer", - "otel.library.version": "v2", - "service.version": "v1.2.3", - "w3c.tracestate": "state", - "version": "v1.2.3", - "otel.trace_id": "72df520af2bde7a5240031ead750e5f3", - "events": "[{\"time_unix_nano\":123,\"name\":\"boom\",\"attributes\":{\"message\":\"Out of memory\",\"accuracy\":2.4},\"dropped_attributes_count\":2},{\"time_unix_nano\":456,\"name\":\"exception\",\"attributes\":{\"exception.message\":\"Out of memory\",\"exception.type\":\"mem\",\"exception.stacktrace\":\"1/2/3\"},\"dropped_attributes_count\":2}]", - "_dd.span_links": `[{"trace_id":"fedcba98765432100123456789abcdef","span_id":"abcdef0123456789","trace_state":"dd=asdf256,ee=jkl;128","attributes":{"a1":"v1","a2":"v2"},"dropped_attributes_count":24},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","attributes":{"a3":"v2","a4":"v4"}},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","dropped_attributes_count":2},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210"}]`, - "error.msg": "Out of memory", - "error.type": "mem", - "error.stack": "1/2/3", - "http.method": "GET", - "http.route": "/path", - "span.kind": "server", + "name": "john", + "env": "staging", + "otel.status_code": "Error", + "otel.status_description": "Error", + "otel.library.name": "ddtracer", + "otel.library.version": "v2", + "service.version": "v1.2.3", + "w3c.tracestate": "state", + "version": "v1.2.3", + "otel.trace_id": "72df520af2bde7a5240031ead750e5f3", + "events": "[{\"time_unix_nano\":123,\"name\":\"boom\",\"attributes\":{\"message\":\"Out of memory\",\"accuracy\":2.4},\"dropped_attributes_count\":2},{\"time_unix_nano\":456,\"name\":\"exception\",\"attributes\":{\"exception.message\":\"Out of memory\",\"exception.type\":\"mem\",\"exception.stacktrace\":\"1/2/3\"},\"dropped_attributes_count\":2}]", + "_dd.span_links": `[{"trace_id":"fedcba98765432100123456789abcdef","span_id":"abcdef0123456789","trace_state":"dd=asdf256,ee=jkl;128","attributes":{"a1":"v1","a2":"v2"},"dropped_attributes_count":24},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","attributes":{"a3":"v2","a4":"v4"}},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","dropped_attributes_count":2},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210"}]`, + "error.msg": "Out of memory", + "error.type": "mem", + "error.stack": "1/2/3", + "http.method": "GET", + "http.route": "/path", + "span.kind": "server", + "_dd.span_events.has_exception": "true", }, Metrics: map[string]float64{ "approx": 1.2, @@ -2397,22 +2400,23 @@ func testOTLPConvertSpan(enableOperationAndResourceNameV2 bool, t *testing.T) { Duration: 200000000, Error: 1, Meta: map[string]string{ - "name": "john", - "otel.trace_id": "72df520af2bde7a5240031ead750e5f3", - "env": "staging", - "otel.status_code": "Error", - "otel.status_description": "Error", - "otel.library.name": "ddtracer", - "otel.library.version": "v2", - "service.version": "v1.2.3", - "w3c.tracestate": "state", - "version": "v1.2.3", - "events": `[{"time_unix_nano":123,"name":"boom","attributes":{"key":"Out of memory","accuracy":2.4},"dropped_attributes_count":2},{"time_unix_nano":456,"name":"exception","attributes":{"exception.message":"Out of memory","exception.type":"mem","exception.stacktrace":"1/2/3"},"dropped_attributes_count":2}]`, - "_dd.span_links": `[{"trace_id":"fedcba98765432100123456789abcdef","span_id":"abcdef0123456789","trace_state":"dd=asdf256,ee=jkl;128", "attributes":{"a1":"v1","a2":"v2"},"dropped_attributes_count":24},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","attributes":{"a3":"v2","a4":"v4"}},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","dropped_attributes_count":2},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210"}]`, - "error.msg": "Out of memory", - "error.type": "mem", - "error.stack": "1/2/3", - "span.kind": "server", + "name": "john", + "otel.trace_id": "72df520af2bde7a5240031ead750e5f3", + "env": "staging", + "otel.status_code": "Error", + "otel.status_description": "Error", + "otel.library.name": "ddtracer", + "otel.library.version": "v2", + "service.version": "v1.2.3", + "w3c.tracestate": "state", + "version": "v1.2.3", + "events": `[{"time_unix_nano":123,"name":"boom","attributes":{"key":"Out of memory","accuracy":2.4},"dropped_attributes_count":2},{"time_unix_nano":456,"name":"exception","attributes":{"exception.message":"Out of memory","exception.type":"mem","exception.stacktrace":"1/2/3"},"dropped_attributes_count":2}]`, + "_dd.span_links": `[{"trace_id":"fedcba98765432100123456789abcdef","span_id":"abcdef0123456789","trace_state":"dd=asdf256,ee=jkl;128", "attributes":{"a1":"v1","a2":"v2"},"dropped_attributes_count":24},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","attributes":{"a3":"v2","a4":"v4"}},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","dropped_attributes_count":2},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210"}]`, + "error.msg": "Out of memory", + "error.type": "mem", + "error.stack": "1/2/3", + "span.kind": "server", + "_dd.span_events.has_exception": "true", }, Metrics: map[string]float64{ "approx": 1.2, @@ -2524,26 +2528,27 @@ func testOTLPConvertSpan(enableOperationAndResourceNameV2 bool, t *testing.T) { Duration: 200000000, Error: 1, Meta: map[string]string{ - "name": "john", - "env": "prod", - "deployment.environment": "prod", - "otel.trace_id": "72df520af2bde7a5240031ead750e5f3", - "otel.status_code": "Error", - "otel.status_description": "Error", - "otel.library.name": "ddtracer", - "otel.library.version": "v2", - "service.version": "v1.2.3", - "w3c.tracestate": "state", - "version": "v1.2.3", - "events": "[{\"time_unix_nano\":123,\"name\":\"boom\",\"attributes\":{\"message\":\"Out of memory\",\"accuracy\":2.4},\"dropped_attributes_count\":2},{\"time_unix_nano\":456,\"name\":\"exception\",\"attributes\":{\"exception.message\":\"Out of memory\",\"exception.type\":\"mem\",\"exception.stacktrace\":\"1/2/3\"},\"dropped_attributes_count\":2}]", - "_dd.span_links": `[{"trace_id":"fedcba98765432100123456789abcdef","span_id":"abcdef0123456789","trace_state":"dd=asdf256,ee=jkl;128","attributes":{"a1":"v1","a2":"v2"},"dropped_attributes_count":24},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","attributes":{"a3":"v2","a4":"v4"}},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","dropped_attributes_count":2},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210"}]`, - "error.msg": "Out of memory", - "error.type": "mem", - "error.stack": "1/2/3", - "http.method": "GET", - "http.route": "/path", - "peer.service": "userbase", - "span.kind": "server", + "name": "john", + "env": "prod", + "deployment.environment": "prod", + "otel.trace_id": "72df520af2bde7a5240031ead750e5f3", + "otel.status_code": "Error", + "otel.status_description": "Error", + "otel.library.name": "ddtracer", + "otel.library.version": "v2", + "service.version": "v1.2.3", + "w3c.tracestate": "state", + "version": "v1.2.3", + "events": "[{\"time_unix_nano\":123,\"name\":\"boom\",\"attributes\":{\"message\":\"Out of memory\",\"accuracy\":2.4},\"dropped_attributes_count\":2},{\"time_unix_nano\":456,\"name\":\"exception\",\"attributes\":{\"exception.message\":\"Out of memory\",\"exception.type\":\"mem\",\"exception.stacktrace\":\"1/2/3\"},\"dropped_attributes_count\":2}]", + "_dd.span_links": `[{"trace_id":"fedcba98765432100123456789abcdef","span_id":"abcdef0123456789","trace_state":"dd=asdf256,ee=jkl;128","attributes":{"a1":"v1","a2":"v2"},"dropped_attributes_count":24},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","attributes":{"a3":"v2","a4":"v4"}},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","dropped_attributes_count":2},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210"}]`, + "error.msg": "Out of memory", + "error.type": "mem", + "error.stack": "1/2/3", + "http.method": "GET", + "http.route": "/path", + "peer.service": "userbase", + "span.kind": "server", + "_dd.span_events.has_exception": "true", }, Metrics: map[string]float64{ "approx": 1.2, @@ -2654,24 +2659,25 @@ func testOTLPConvertSpan(enableOperationAndResourceNameV2 bool, t *testing.T) { Duration: 200000000, Error: 1, Meta: map[string]string{ - "name": "john", - "env": "staging", - "otel.status_code": "Error", - "otel.status_description": "Error", - "otel.library.name": "ddtracer", - "otel.library.version": "v2", - "service.version": "v1.2.3", - "w3c.tracestate": "state", - "version": "v1.2.3", - "otel.trace_id": "72df520af2bde7a5240031ead750e5f3", - "events": "[{\"time_unix_nano\":123,\"name\":\"boom\",\"attributes\":{\"message\":\"Out of memory\",\"accuracy\":2.4},\"dropped_attributes_count\":2},{\"time_unix_nano\":456,\"name\":\"exception\",\"attributes\":{\"exception.message\":\"Out of memory\",\"exception.type\":\"mem\",\"exception.stacktrace\":\"1/2/3\"},\"dropped_attributes_count\":2}]", - "_dd.span_links": `[{"trace_id":"fedcba98765432100123456789abcdef","span_id":"abcdef0123456789","trace_state":"dd=asdf256,ee=jkl;128","attributes":{"a1":"v1","a2":"v2"},"dropped_attributes_count":24},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","attributes":{"a3":"v2","a4":"v4"}},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","dropped_attributes_count":2},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210"}]`, - "error.msg": "Out of memory", - "error.type": "mem", - "error.stack": "1/2/3", - "http.method": "GET", - "http.route": "/path", - "span.kind": "server", + "name": "john", + "env": "staging", + "otel.status_code": "Error", + "otel.status_description": "Error", + "otel.library.name": "ddtracer", + "otel.library.version": "v2", + "service.version": "v1.2.3", + "w3c.tracestate": "state", + "version": "v1.2.3", + "otel.trace_id": "72df520af2bde7a5240031ead750e5f3", + "events": "[{\"time_unix_nano\":123,\"name\":\"boom\",\"attributes\":{\"message\":\"Out of memory\",\"accuracy\":2.4},\"dropped_attributes_count\":2},{\"time_unix_nano\":456,\"name\":\"exception\",\"attributes\":{\"exception.message\":\"Out of memory\",\"exception.type\":\"mem\",\"exception.stacktrace\":\"1/2/3\"},\"dropped_attributes_count\":2}]", + "_dd.span_links": `[{"trace_id":"fedcba98765432100123456789abcdef","span_id":"abcdef0123456789","trace_state":"dd=asdf256,ee=jkl;128","attributes":{"a1":"v1","a2":"v2"},"dropped_attributes_count":24},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","attributes":{"a3":"v2","a4":"v4"}},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210","dropped_attributes_count":2},{"trace_id":"abcdef0123456789abcdef0123456789","span_id":"fedcba9876543210"}]`, + "error.msg": "Out of memory", + "error.type": "mem", + "error.stack": "1/2/3", + "http.method": "GET", + "http.route": "/path", + "span.kind": "server", + "_dd.span_events.has_exception": "true", }, Metrics: map[string]float64{ "approx": 1.2, diff --git a/pkg/trace/config/config.go b/pkg/trace/config/config.go index 7a86fc71ac8e4..bc84e401c88bf 100644 --- a/pkg/trace/config/config.go +++ b/pkg/trace/config/config.go @@ -320,6 +320,9 @@ type AgentConfig struct { ProbabilisticSamplerHashSeed uint32 ProbabilisticSamplerSamplingPercentage float32 + // Error Tracking Standalone + ErrorTrackingStandalone bool + // Receiver ReceiverEnabled bool // specifies whether Receiver listeners are enabled. Unless OTLPReceiver is used, this should always be true. ReceiverHost string @@ -502,6 +505,8 @@ func New() *AgentConfig { RareSamplerCooldownPeriod: 5 * time.Minute, RareSamplerCardinality: 200, + ErrorTrackingStandalone: false, + ReceiverEnabled: true, ReceiverHost: "localhost", ReceiverPort: 8126, diff --git a/pkg/trace/transform/transform.go b/pkg/trace/transform/transform.go index 3a02df7cd2add..8582b8c12e2c5 100644 --- a/pkg/trace/transform/transform.go +++ b/pkg/trace/transform/transform.go @@ -128,6 +128,7 @@ func OtelSpanToDDSpan( if otelspan.Events().Len() > 0 { ddspan.Meta["events"] = MarshalEvents(otelspan.Events()) } + TagSpanIfContainsExceptionEvent(otelspan, ddspan) if otelspan.Links().Len() > 0 { ddspan.Meta["_dd.span_links"] = MarshalLinks(otelspan.Links()) } @@ -195,6 +196,16 @@ func OtelSpanToDDSpan( return ddspan } +// TagSpanIfContainsExceptionEvent tags spans that contain at least on exception span event. +func TagSpanIfContainsExceptionEvent(otelspan ptrace.Span, ddspan *pb.Span) { + for i := range otelspan.Events().Len() { + if otelspan.Events().At(i).Name() == "exception" { + ddspan.Meta["_dd.span_events.has_exception"] = "true" + return + } + } +} + // MarshalEvents marshals events into JSON. func MarshalEvents(events ptrace.SpanEventSlice) string { var str strings.Builder diff --git a/releasenotes/notes/add-error-tracking-standalone-config-option-1b88cedc9e164127.yaml b/releasenotes/notes/add-error-tracking-standalone-config-option-1b88cedc9e164127.yaml new file mode 100644 index 0000000000000..12a123d78beba --- /dev/null +++ b/releasenotes/notes/add-error-tracking-standalone-config-option-1b88cedc9e164127.yaml @@ -0,0 +1,13 @@ +# Each section from every release note are combined when the +# CHANGELOG.rst is rendered. So the text needs to be worded so that +# it does not depend on any information only available in another +# section. This may mean repeating some details, but each section +# must be readable independently of the other. +# +# Each section note must be formatted as reStructuredText. +--- +features: + - | + APM: Introducing the Error Tracking Standalone config option. Only span chunks + that contain errors or exception OpenTelemetry span events are taken into + consideration by sampling. diff --git a/tasks/__init__.py b/tasks/__init__.py index 12753e9a67e68..93ce84f1c37b2 100644 --- a/tasks/__init__.py +++ b/tasks/__init__.py @@ -62,6 +62,7 @@ trace_agent, vim, vscode, + worktree, ) from tasks.build_tags import audit_tag_impact, print_default_build_tags from tasks.components import lint_components, lint_fxutil_oneshot_test @@ -210,6 +211,7 @@ ns.add_collection(collector) ns.add_collection(invoke_unit_tests) ns.add_collection(debug) +ns.add_collection(worktree) ns.configure( { "run": { diff --git a/tasks/agent.py b/tasks/agent.py index af988249697a8..3399e4d7c4a58 100644 --- a/tasks/agent.py +++ b/tasks/agent.py @@ -12,7 +12,7 @@ import tempfile from invoke import task -from invoke.exceptions import Exit, ParseError +from invoke.exceptions import Exit from tasks.build_tags import filter_incompatible_tags, get_build_tags, get_default_build_tags from tasks.devcontainer import run_on_devcontainer @@ -359,15 +359,10 @@ def system_tests(_): @task -def image_build(ctx, arch='amd64', base_dir="omnibus", python_version="2", skip_tests=False, tag=None, push=False): +def image_build(ctx, arch='amd64', base_dir="omnibus", skip_tests=False, tag=None, push=False): """ Build the docker image """ - BOTH_VERSIONS = ["both", "2+3"] - VALID_VERSIONS = ["2", "3"] + BOTH_VERSIONS - if python_version not in VALID_VERSIONS: - raise ParseError("provided python_version is invalid") - build_context = "Dockerfiles/agent" base_dir = base_dir or os.environ["OMNIBUS_BASE_DIR"] pkg_dir = os.path.join(base_dir, 'pkg') @@ -386,8 +381,6 @@ def image_build(ctx, arch='amd64', base_dir="omnibus", python_version="2", skip_ tag = AGENT_TAG common_build_opts = f"-t {tag} -f {dockerfile_path}" - if python_version not in BOTH_VERSIONS: - common_build_opts = f"{common_build_opts} --build-arg PYTHON_VERSION={python_version}" # Build with the testing target if not skip_tests: diff --git a/tasks/git.py b/tasks/git.py index 658c6615cd621..8125035b04320 100644 --- a/tasks/git.py +++ b/tasks/git.py @@ -4,15 +4,19 @@ from invoke.exceptions import Exit from tasks.libs.common.color import color_message -from tasks.libs.common.git import get_current_branch +from tasks.libs.common.git import get_current_branch, get_default_branch @task def check_protected_branch(ctx): local_branch = get_current_branch(ctx) - if local_branch == 'main': - print(color_message("You're about to commit or push to the main, are you sure this is what you want?", "red")) + if local_branch == get_default_branch(): + print( + color_message( + f"You're about to commit or push to {get_default_branch()}, are you sure this is what you want?", "red" + ) + ) raise Exit(code=1) if re.fullmatch(r'^[0-9]+\.[0-9]+\.x$', local_branch): diff --git a/tasks/github_tasks.py b/tasks/github_tasks.py index b5723f232cd93..abe46fd6ff366 100644 --- a/tasks/github_tasks.py +++ b/tasks/github_tasks.py @@ -19,8 +19,9 @@ trigger_macos_workflow, ) from tasks.libs.common.color import color_message -from tasks.libs.common.constants import DEFAULT_BRANCH, DEFAULT_INTEGRATIONS_CORE_BRANCH +from tasks.libs.common.constants import DEFAULT_INTEGRATIONS_CORE_BRANCH from tasks.libs.common.datadog_api import create_gauge, send_event, send_metrics +from tasks.libs.common.git import get_default_branch from tasks.libs.common.junit_upload_core import repack_macos_junit_tar from tasks.libs.common.utils import get_git_pretty_ref from tasks.libs.owners.linter import codeowner_has_orphans, directory_has_packages_without_owner @@ -36,7 +37,7 @@ def concurrency_key(): current_ref = get_git_pretty_ref() # We want workflows to run to completion on the default branch and release branches - if re.search(rf'^({DEFAULT_BRANCH}|\d+\.\d+\.x)$', current_ref): + if re.search(rf'^({get_default_branch()}|\d+\.\d+\.x)$', current_ref): return None return current_ref @@ -68,7 +69,7 @@ def _trigger_macos_workflow(release, destination=None, retry_download=0, retry_i def trigger_macos( _, workflow_type="build", - datadog_agent_ref=DEFAULT_BRANCH, + datadog_agent_ref=None, release_version="nightly-a7", major_version="7", destination=".", @@ -79,6 +80,13 @@ def trigger_macos( test_washer=False, integrations_core_ref=DEFAULT_INTEGRATIONS_CORE_BRANCH, ): + """ + Args: + datadog_agent_ref: If None, will be the default branch. + """ + + datadog_agent_ref = datadog_agent_ref or get_default_branch() + if workflow_type == "build": conclusion = _trigger_macos_workflow( # Provide the release version to be able to fetch the associated diff --git a/tasks/gotest.py b/tasks/gotest.py index ceaee2f020bc2..3ef5ebc9de905 100644 --- a/tasks/gotest.py +++ b/tasks/gotest.py @@ -514,12 +514,13 @@ def get_modified_packages(ctx, build_tags=None, lint=False) -> list[GoModule]: modules_to_test[best_module_path] = GoModule(best_module_path, test_targets=[relative_target]) # Clean up duplicated paths to reduce Go test cmd length + default_modules = get_default_modules() for module in modules_to_test: modules_to_test[module].test_targets = clean_nested_paths(modules_to_test[module].test_targets) if ( len(modules_to_test[module].test_targets) >= WINDOWS_MAX_PACKAGES_NUMBER ): # With more packages we can reach the limit of the command line length on Windows - modules_to_test[module].test_targets = get_default_modules()[module].test_targets + modules_to_test[module].test_targets = default_modules[module].test_targets print("Running tests for the following modules:") for module in modules_to_test: @@ -752,16 +753,17 @@ def format_packages(ctx: Context, impacted_packages: set[str], build_tags: list[ packages = [f'{package.replace("github.com/DataDog/datadog-agent/", "./")}' for package in impacted_packages] modules_to_test = {} + default_modules = get_default_modules() for package in packages: module_path = get_go_module(package) # Check if the module is in the target list of the modules we want to test - if module_path not in get_default_modules() or not get_default_modules()[module_path].should_test(): + if module_path not in default_modules or not default_modules[module_path].should_test(): continue # Check if the package is in the target list of the module we want to test targeted = False - for target in get_default_modules()[module_path].test_targets: + for target in default_modules[module_path].test_targets: if normpath(os.path.join(module_path, target)) in package: targeted = True break @@ -784,12 +786,13 @@ def format_packages(ctx: Context, impacted_packages: set[str], build_tags: list[ modules_to_test[module_path] = GoModule(module_path, test_targets=[relative_target]) # Clean up duplicated paths to reduce Go test cmd length + default_modules = get_default_modules() for module in modules_to_test: modules_to_test[module].test_targets = clean_nested_paths(modules_to_test[module].test_targets) if ( len(modules_to_test[module].test_targets) >= WINDOWS_MAX_PACKAGES_NUMBER ): # With more packages we can reach the limit of the command line length on Windows - modules_to_test[module].test_targets = get_default_modules()[module].test_targets + modules_to_test[module].test_targets = default_modules[module].test_targets module_to_remove = [] # Clean up to avoid running tests on package with no Go files matching build tags diff --git a/tasks/libs/ciproviders/github_actions_tools.py b/tasks/libs/ciproviders/github_actions_tools.py index 8bef945f46460..9399302271b5d 100644 --- a/tasks/libs/ciproviders/github_actions_tools.py +++ b/tasks/libs/ciproviders/github_actions_tools.py @@ -11,13 +11,13 @@ from tasks.libs.ciproviders.github_api import GithubAPI from tasks.libs.common.color import color_message -from tasks.libs.common.utils import DEFAULT_BRANCH +from tasks.libs.common.git import get_default_branch def trigger_macos_workflow( workflow_name="macos.yaml", github_action_ref="master", - datadog_agent_ref=DEFAULT_BRANCH, + datadog_agent_ref=None, release_version=None, major_version=None, gitlab_pipeline_id=None, @@ -31,6 +31,8 @@ def trigger_macos_workflow( """ Trigger a workflow to build a MacOS Agent. """ + + datadog_agent_ref = datadog_agent_ref or get_default_branch() inputs = {} if datadog_agent_ref is not None: diff --git a/tasks/libs/ciproviders/github_api.py b/tasks/libs/ciproviders/github_api.py index 2d61567f5016b..f8ed33e3a3252 100644 --- a/tasks/libs/ciproviders/github_api.py +++ b/tasks/libs/ciproviders/github_api.py @@ -13,6 +13,7 @@ from tasks.libs.common.color import color_message from tasks.libs.common.constants import GITHUB_REPO_NAME +from tasks.libs.common.git import get_default_branch try: import semver @@ -494,7 +495,7 @@ def create_release_pr(title, base_branch, target_branch, version, changelog_pr=F ] if changelog_pr: - labels.append("backport/main") + labels.append(f"backport/{get_default_branch()}") updated_pr = github.update_pr( pull_number=pr.number, diff --git a/tasks/libs/ciproviders/gitlab_api.py b/tasks/libs/ciproviders/gitlab_api.py index 212e2ce38f142..0cf4c429b59b0 100644 --- a/tasks/libs/ciproviders/gitlab_api.py +++ b/tasks/libs/ciproviders/gitlab_api.py @@ -23,8 +23,7 @@ from invoke.exceptions import Exit from tasks.libs.common.color import Color, color_message -from tasks.libs.common.constants import DEFAULT_BRANCH -from tasks.libs.common.git import get_common_ancestor, get_current_branch +from tasks.libs.common.git import get_common_ancestor, get_current_branch, get_default_branch from tasks.libs.common.utils import retry_function BASE_URL = "https://gitlab.ddbuild.io" @@ -1214,7 +1213,7 @@ def compute_gitlab_ci_config_diff(ctx, before: str, after: str): after_name = after or "local files" # The before commit is the LCA commit between before and after - before = before or DEFAULT_BRANCH + before = before or get_default_branch() before = get_common_ancestor(ctx, before, after or "HEAD") print(f'Getting after changes config ({color_message(after_name, Color.BOLD)})') diff --git a/tasks/libs/common/constants.py b/tasks/libs/common/constants.py index ff9dc7a285a42..0b100a3432263 100644 --- a/tasks/libs/common/constants.py +++ b/tasks/libs/common/constants.py @@ -1,4 +1,3 @@ -DEFAULT_BRANCH = "main" DEFAULT_INTEGRATIONS_CORE_BRANCH = "master" GITHUB_ORG = "DataDog" REPO_NAME = "datadog-agent" diff --git a/tasks/libs/common/git.py b/tasks/libs/common/git.py index 8c7b9d9e2f080..6606a4ce522c2 100644 --- a/tasks/libs/common/git.py +++ b/tasks/libs/common/git.py @@ -5,10 +5,10 @@ from contextlib import contextmanager from typing import TYPE_CHECKING +from invoke import Context from invoke.exceptions import Exit from tasks.libs.common.color import Color, color_message -from tasks.libs.common.constants import DEFAULT_BRANCH from tasks.libs.common.user_interactions import yes_no_question if TYPE_CHECKING: @@ -90,7 +90,9 @@ def get_file_modifications( return modifications -def get_modified_files(ctx, base_branch="main") -> list[str]: +def get_modified_files(ctx, base_branch=None) -> list[str]: + base_branch = base_branch or get_default_branch() + return get_file_modifications( ctx, base_branch=base_branch, added=True, modified=True, only_names=True, no_renames=True ) @@ -100,7 +102,23 @@ def get_current_branch(ctx) -> str: return ctx.run("git rev-parse --abbrev-ref HEAD", hide=True).stdout.strip() -def get_common_ancestor(ctx, branch, base=DEFAULT_BRANCH) -> str: +def is_agent6(ctx) -> bool: + return get_current_branch(ctx).startswith("6.") + + +def get_default_branch(): + """Returns the default git branch given the current context (agent 6 / 7).""" + + # We create a context to avoid passing context in each function + # This context is used to get the current branch so there is no side effect + ctx = Context() + + return '6.53.x' if is_agent6(ctx) else 'main' + + +def get_common_ancestor(ctx, branch, base=None) -> str: + base = base or get_default_branch() + return ctx.run(f"git merge-base {branch} {base}", hide=True).stdout.strip() @@ -132,7 +150,7 @@ def get_main_parent_commit(ctx) -> str: """ Get the commit sha your current branch originated from """ - return ctx.run("git merge-base HEAD origin/main", hide=True).stdout.strip() + return ctx.run(f"git merge-base HEAD origin/{get_default_branch()}", hide=True).stdout.strip() def check_base_branch(branch, release_version): @@ -140,7 +158,7 @@ def check_base_branch(branch, release_version): Checks if the given branch is either the default branch or the release branch associated with the given release version. """ - return branch == DEFAULT_BRANCH or branch == release_version.branch() + return branch == get_default_branch() or branch == release_version.branch() def try_git_command(ctx, git_command): diff --git a/tasks/libs/common/gomodules.py b/tasks/libs/common/gomodules.py index ffbdd9f03d10d..357ab03dc515e 100644 --- a/tasks/libs/common/gomodules.py +++ b/tasks/libs/common/gomodules.py @@ -7,13 +7,13 @@ import sys from collections.abc import Callable from dataclasses import dataclass -from functools import lru_cache from pathlib import Path from typing import ClassVar import yaml import tasks +from tasks.libs.common.utils import agent_working_directory class ConfigDumper(yaml.SafeDumper): @@ -305,7 +305,6 @@ def dependency_path(self, agent_version): AGENT_MODULE_PATH_PREFIX = "github.com/DataDog/datadog-agent/" -@lru_cache def get_default_modules(base_dir: Path | None = None) -> dict[str, GoModule]: """Load the default modules from the modules.yml file. @@ -313,6 +312,8 @@ def get_default_modules(base_dir: Path | None = None) -> dict[str, GoModule]: base_dir: Root directory of the agent repository ('.' by default). """ + base_dir = base_dir or agent_working_directory() + return Configuration.from_file(base_dir).modules diff --git a/tasks/libs/common/utils.py b/tasks/libs/common/utils.py index 10882ddfeed7d..44758ae35b543 100644 --- a/tasks/libs/common/utils.py +++ b/tasks/libs/common/utils.py @@ -24,8 +24,8 @@ from invoke.exceptions import Exit from tasks.libs.common.color import Color, color_message -from tasks.libs.common.constants import ALLOWED_REPO_ALL_BRANCHES, DEFAULT_BRANCH, REPO_PATH -from tasks.libs.common.git import get_commit_sha +from tasks.libs.common.constants import ALLOWED_REPO_ALL_BRANCHES, REPO_PATH +from tasks.libs.common.git import get_commit_sha, get_default_branch from tasks.libs.owners.parsing import search_owners from tasks.libs.releasing.version import get_version from tasks.libs.types.arch import Arch @@ -373,6 +373,7 @@ def get_version_ldflags(ctx, major_version='7', install_path=None): Compute the version from the git tags, and set the appropriate compiler flags """ + payload_v = get_payload_version() commit = get_commit_sha(ctx, short=True) @@ -494,8 +495,8 @@ def environ(env): def is_pr_context(branch, pr_id, test_name): - if branch == DEFAULT_BRANCH: - print(f"Running on {DEFAULT_BRANCH}, skipping check for {test_name}.") + if branch == get_default_branch(): + print(f"Running on {get_default_branch()}, skipping check for {test_name}.") return False if not pr_id: print(f"PR not found, skipping check for {test_name}.") @@ -749,3 +750,11 @@ def get_metric_origin(origin_product, origin_sub_product, origin_product_detail, if origin_field: return {"origin": metric_origin} return metric_origin + + +def agent_working_directory(): + """Returns the working directory for the current context (agent 6 / 7).""" + + from tasks.libs.common.worktree import LOCAL_DIRECTORY, WORKTREE_DIRECTORY, is_worktree + + return WORKTREE_DIRECTORY if is_worktree() else LOCAL_DIRECTORY diff --git a/tasks/libs/common/worktree.py b/tasks/libs/common/worktree.py new file mode 100644 index 0000000000000..7e7da54af73df --- /dev/null +++ b/tasks/libs/common/worktree.py @@ -0,0 +1,109 @@ +"""Worktree utilities, used to execute tasks from this local repository (main) to a worktree with a different HEAD (e.g. 6.53.x). + +Common environment variables that can be used: +- WORKTREE_NO_PULL: If set to any value, the worktree will not be pulled before running the command. +""" + +import os +from contextlib import contextmanager +from pathlib import Path + +from invoke.exceptions import Exit + +from tasks.libs.common.color import Color, color_message +from tasks.libs.common.git import get_current_branch + +WORKTREE_DIRECTORY = Path.cwd().parent / "datadog-agent-worktree" +LOCAL_DIRECTORY = Path.cwd().resolve() + + +def init_env(ctx, branch: str | None = None): + """Will prepare the environment for commands applying to a worktree. + + To be used before each worktree section. + Will: + 1. Add the agent worktree if not present. + 2. Fetch the latest changes from the agent worktree. + """ + + if not WORKTREE_DIRECTORY.is_dir(): + if not ctx.run(f"git worktree add '{WORKTREE_DIRECTORY}' origin/{branch or 'main'}", warn=True): + raise Exit( + f'{color_message("Error", Color.RED)}: Cannot initialize worktree environment. You might want to reset the worktree directory with `inv worktree.remove`', + code=1, + ) + + if branch: + worktree_branch = ctx.run( + f"git -C '{WORKTREE_DIRECTORY}' rev-parse --abbrev-ref HEAD", hide=True + ).stdout.strip() + if worktree_branch != branch: + ctx.run(f"git -C '{WORKTREE_DIRECTORY}' checkout '{branch}'", hide=True) + + if not os.environ.get("AGENT_WORKTREE_NO_PULL"): + ctx.run(f"git -C '{WORKTREE_DIRECTORY}' pull", hide=True) + + +def remove_env(ctx): + """Will remove the environment for commands applying to a worktree.""" + + ctx.run(f"git worktree remove -f '{WORKTREE_DIRECTORY}'", warn=True) + + +def is_worktree(): + """Will return True if the current environment is a worktree environment.""" + + return Path.cwd() == WORKTREE_DIRECTORY + + +def enter_env(ctx, branch: str | None, skip_checkout=False): + """Enters the worktree environment.""" + + if not branch: + assert skip_checkout, 'skip_checkout must be set to True if branch is None' + + if not skip_checkout: + init_env(ctx, branch) + else: + assert WORKTREE_DIRECTORY.is_dir(), "Worktree directory is not present and skip_checkout is set to True" + + os.chdir(WORKTREE_DIRECTORY) + if skip_checkout and branch: + current_branch = get_current_branch(ctx) + assert ( + current_branch == branch + ), f"skip_checkout is True but the current branch ({current_branch}) is not {branch}. You should check out the branch before using this command, this can be safely done with `inv worktree.checkout {branch}`." + + +def exit_env(): + """Exits the worktree environment.""" + + os.chdir(LOCAL_DIRECTORY) + + +@contextmanager +def agent_context(ctx, branch: str | None, skip_checkout=False): + """Applies code to the worktree environment if the branch is not None. + + Args: + branch: The branch to switch to. If None, will enter the worktree environment without switching branch (ensures that skip_checkout is True). + skip_checkout: If True, the branch will not be checked out (no pull will be performed too). + + Usage: + > with agent_context(ctx, branch): + > ctx.run("head CHANGELOG.rst") # Displays the changelog of the target branch + """ + + # Do not stack two environments + if is_worktree(): + yield + return + + try: + # Enter + enter_env(ctx, branch, skip_checkout=skip_checkout) + + yield + finally: + # Exit + exit_env() diff --git a/tasks/libs/notify/pipeline_status.py b/tasks/libs/notify/pipeline_status.py index 25915ccc36d82..569973decfb0b 100644 --- a/tasks/libs/notify/pipeline_status.py +++ b/tasks/libs/notify/pipeline_status.py @@ -2,7 +2,7 @@ import re from tasks.libs.ciproviders.gitlab_api import get_commit, get_pipeline -from tasks.libs.common.constants import DEFAULT_BRANCH +from tasks.libs.common.git import get_default_branch from tasks.libs.notify.utils import DEPLOY_PIPELINES_CHANNEL, PIPELINES_CHANNEL, PROJECT_NAME from tasks.libs.pipeline.data import get_failed_jobs from tasks.libs.pipeline.notifications import ( @@ -40,7 +40,7 @@ def send_message(ctx, notification_type, dry_run): # For deploy pipelines not on the main branch, send notifications in a # dedicated channel. slack_channel = PIPELINES_CHANNEL - if notification_type == "deploy" and pipeline.ref != DEFAULT_BRANCH: + if notification_type == "deploy" and pipeline.ref != get_default_branch(): slack_channel = DEPLOY_PIPELINES_CHANNEL header = "" @@ -64,7 +64,7 @@ def send_message(ctx, notification_type, dry_run): else: send_slack_message(slack_channel, str(message)) - if should_send_message_to_author(pipeline.ref, DEFAULT_BRANCH): + if should_send_message_to_author(pipeline.ref, get_default_branch()): author_email = commit.author_email if dry_run: print(f"Would send to {author_email}:\n{str(message)}") diff --git a/tasks/libs/pipeline/tools.py b/tasks/libs/pipeline/tools.py index fc38ce67e883a..7f9d08e8eca6e 100644 --- a/tasks/libs/pipeline/tools.py +++ b/tasks/libs/pipeline/tools.py @@ -13,8 +13,8 @@ from tasks.libs.ciproviders.gitlab_api import refresh_pipeline from tasks.libs.common.color import Color, color_message +from tasks.libs.common.git import get_default_branch from tasks.libs.common.user_interactions import yes_no_question -from tasks.libs.common.utils import DEFAULT_BRANCH PIPELINE_FINISH_TIMEOUT_SEC = 3600 * 5 @@ -115,7 +115,7 @@ def gracefully_cancel_pipeline(repo: Project, pipeline: ProjectPipeline, force_c def trigger_agent_pipeline( repo: Project, - ref=DEFAULT_BRANCH, + ref=None, release_version_6="nightly", release_version_7="nightly-a7", branch="nightly", @@ -134,6 +134,8 @@ def trigger_agent_pipeline( - run a pipeline with all end-to-end tests, - run a deploy pipeline (includes all builds & kitchen tests + uploads artifacts to staging repositories); """ + + ref = ref or get_default_branch() args = {} if deploy: diff --git a/tasks/libs/releasing/json.py b/tasks/libs/releasing/json.py index e0ea089e43fa1..85270b23ce70e 100644 --- a/tasks/libs/releasing/json.py +++ b/tasks/libs/releasing/json.py @@ -7,6 +7,8 @@ from invoke.exceptions import Exit from tasks.libs.common.constants import TAG_FOUND_TEMPLATE +from tasks.libs.common.git import get_default_branch +from tasks.libs.common.worktree import is_worktree from tasks.libs.releasing.documentation import _stringify_config, nightly_entry_for, release_entry_for from tasks.libs.releasing.version import ( VERSION_RE, @@ -41,6 +43,12 @@ "datadog-agent-macos-build": "master", "datadog-agent": "main", } +DEFAULT_BRANCHES_AGENT6 = { + "omnibus-software": "6.53.x", + "omnibus-ruby": "6.53.x", + "datadog-agent-macos-build": "6.53.x", + "datadog-agent": "6.53.x", +} def load_release_json(): @@ -335,8 +343,12 @@ def generate_repo_data(warning_mode, next_version, release_branch): data = {} for repo in repos: branch = release_branch - if branch == "main": - branch = next_version.branch() if repo == "integrations-core" else DEFAULT_BRANCHES.get(repo, "main") + if branch == get_default_branch(): + branch = ( + next_version.branch() + if repo == "integrations-core" + else (DEFAULT_BRANCHES_AGENT6 if is_worktree() else DEFAULT_BRANCHES).get(repo, get_default_branch()) + ) data[repo] = { 'branch': branch, 'previous_tag': previous_tags.get(repo, ""), diff --git a/tasks/libs/releasing/notes.py b/tasks/libs/releasing/notes.py index 10edb3dab145d..d831f372b6197 100644 --- a/tasks/libs/releasing/notes.py +++ b/tasks/libs/releasing/notes.py @@ -2,7 +2,8 @@ from invoke import Failure -from tasks.libs.common.constants import DEFAULT_BRANCH, GITHUB_REPO_NAME +from tasks.libs.common.constants import GITHUB_REPO_NAME +from tasks.libs.common.git import get_default_branch from tasks.libs.releasing.version import current_version @@ -42,7 +43,7 @@ def _add_dca_prelude(ctx, agent7_version, agent6_version=""): f"""prelude: | Released on: {date.today()} - Pinned to datadog-agent v{agent7_version}: `CHANGELOG `_.""" + Pinned to datadog-agent v{agent7_version}: `CHANGELOG `_.""" ) ctx.run(f"git add {new_releasenote}") diff --git a/tasks/linter.py b/tasks/linter.py index 082c3065cc666..f549ae2005075 100644 --- a/tasks/linter.py +++ b/tasks/linter.py @@ -32,8 +32,8 @@ ) from tasks.libs.common.check_tools_version import check_tools_version from tasks.libs.common.color import Color, color_message -from tasks.libs.common.constants import DEFAULT_BRANCH, GITHUB_REPO_NAME -from tasks.libs.common.git import get_file_modifications, get_staged_files +from tasks.libs.common.constants import GITHUB_REPO_NAME +from tasks.libs.common.git import get_default_branch, get_file_modifications, get_staged_files from tasks.libs.common.utils import gitlab_section, is_pr_context, running_in_ci from tasks.libs.owners.parsing import read_owners from tasks.libs.types.copyright import CopyrightLinter, LintFailure @@ -52,7 +52,7 @@ def python(ctx): print( f"""Remember to set up pre-commit to lint your files before committing: - https://github.com/DataDog/datadog-agent/blob/{DEFAULT_BRANCH}/docs/dev/agent_dev_env.md#pre-commit-hooks""" + https://github.com/DataDog/datadog-agent/blob/{get_default_branch()}/docs/dev/agent_dev_env.md#pre-commit-hooks""" ) if running_in_ci(): diff --git a/tasks/modules.py b/tasks/modules.py index e26fe09b82b9f..43f91a2da959c 100644 --- a/tasks/modules.py +++ b/tasks/modules.py @@ -213,6 +213,7 @@ def validate_used_by_otel(ctx: Context): missing_used_by_otel_label: dict[str, list[str]] = defaultdict(list) # for every module labeled as "used_by_otel" + default_modules = get_default_modules() for otel_mod in otel_mods: gomod_path = f"{otel_mod}/go.mod" # get the go.mod data @@ -232,7 +233,7 @@ def validate_used_by_otel(ctx: Context): # we need the relative path of module (without github.com/DataDog/datadog-agent/ prefix) rel_path = require['Path'].removeprefix("github.com/DataDog/datadog-agent/") # check if indirect module is labeled as "used_by_otel" - if rel_path not in get_default_modules() or not get_default_modules()[rel_path].used_by_otel: + if rel_path not in default_modules or not default_modules[rel_path].used_by_otel: missing_used_by_otel_label[rel_path].append(otel_mod) if missing_used_by_otel_label: message = f"{color_message('ERROR', Color.RED)}: some indirect local dependencies of modules labeled \"used_by_otel\" are not correctly labeled in get_default_modules()\n" @@ -260,6 +261,7 @@ def show(_, path: str, remove_defaults: bool = False, base_dir: str = '.'): Args: remove_defaults: If True, will remove default values from the output. + base_dir: Where to load modules from. """ config = Configuration.from_file(Path(base_dir)) @@ -281,6 +283,7 @@ def show_all(_, base_dir: str = '.', ignored=False): """Show the list of modules. Args: + base_dir: Where to load modules from. ignored: If True, will list ignored modules. """ diff --git a/tasks/pipeline.py b/tasks/pipeline.py index 31b728798c334..8d63109fee536 100644 --- a/tasks/pipeline.py +++ b/tasks/pipeline.py @@ -19,8 +19,7 @@ refresh_pipeline, ) from tasks.libs.common.color import Color, color_message -from tasks.libs.common.constants import DEFAULT_BRANCH -from tasks.libs.common.git import get_commit_sha, get_current_branch +from tasks.libs.common.git import get_commit_sha, get_current_branch, get_default_branch from tasks.libs.common.utils import ( get_all_allowed_repo_branches, is_allowed_repo_branch, @@ -94,7 +93,7 @@ def check_deploy_pipeline(repo: Project, git_ref: str, release_version_6, releas @task -def clean_running_pipelines(ctx, git_ref=DEFAULT_BRANCH, here=False, use_latest_sha=False, sha=None): +def clean_running_pipelines(ctx, git_ref=None, here=False, use_latest_sha=False, sha=None): """ Fetch running pipelines on a target ref (+ optionally a git sha), and ask the user if they should be cancelled. @@ -104,6 +103,8 @@ def clean_running_pipelines(ctx, git_ref=DEFAULT_BRANCH, here=False, use_latest_ if here: git_ref = get_current_branch(ctx) + else: + git_ref = git_ref or get_default_branch() print(f"Fetching running pipelines on {git_ref}") @@ -130,11 +131,12 @@ def workflow_rules(gitlab_file=".gitlab-ci.yml"): @task -def trigger(_, git_ref=DEFAULT_BRANCH, release_version_6="dev", release_version_7="dev-a7", repo_branch="dev"): +def trigger(_, git_ref=None, release_version_6="dev", release_version_7="dev-a7", repo_branch="dev"): """ OBSOLETE: Trigger a deploy pipeline on the given git ref. Use pipeline.run with the --deploy option instead. """ + git_ref = git_ref or get_default_branch() use_release_entries = "" major_versions = [] @@ -829,16 +831,16 @@ def test_merge_queue(ctx): # Create a new main and push it print("Creating a new main branch") timestamp = int(datetime.now(timezone.utc).timestamp()) - test_main = f"mq/test_{timestamp}" + test_default = f"mq/test_{timestamp}" current_branch = get_current_branch(ctx) - ctx.run("git checkout main", hide=True) + ctx.run(f"git checkout {get_default_branch()}", hide=True) ctx.run("git pull", hide=True) - ctx.run(f"git checkout -b {test_main}", hide=True) - ctx.run(f"git push origin {test_main}", hide=True) + ctx.run(f"git checkout -b {test_default}", hide=True) + ctx.run(f"git push origin {test_default}", hide=True) # Create a PR towards this new branch and adds it to the merge queue print("Creating a PR and adding it to the merge queue") gh = GithubAPI() - pr = gh.create_pr(f"Test MQ for {current_branch}", "", test_main, current_branch) + pr = gh.create_pr(f"Test MQ for {current_branch}", "", test_default, current_branch) pr.create_issue_comment("/merge") # Search for the generated pipeline print(f"PR {pr.html_url} is waiting for MQ pipeline generation") @@ -848,7 +850,7 @@ def test_merge_queue(ctx): time.sleep(30) pipelines = agent.pipelines.list(per_page=100) try: - pipeline = next(p for p in pipelines if p.ref.startswith(f"mq-working-branch-{test_main}")) + pipeline = next(p for p in pipelines if p.ref.startswith(f"mq-working-branch-{test_default}")) print(f"Pipeline found: {pipeline.web_url}") break except StopIteration as e: @@ -866,8 +868,8 @@ def test_merge_queue(ctx): pipeline.cancel() pr.edit(state="closed") ctx.run(f"git checkout {current_branch}", hide=True) - ctx.run(f"git branch -D {test_main}", hide=True) - ctx.run(f"git push origin :{test_main}", hide=True) + ctx.run(f"git branch -D {test_default}", hide=True) + ctx.run(f"git push origin :{test_default}", hide=True) if not success: raise Exit(message="Merge queue test failed", code=1) diff --git a/tasks/release.py b/tasks/release.py index a82deec90161f..5e0b11483a938 100644 --- a/tasks/release.py +++ b/tasks/release.py @@ -20,7 +20,6 @@ from tasks.libs.ciproviders.gitlab_api import get_gitlab_repo from tasks.libs.common.color import Color, color_message from tasks.libs.common.constants import ( - DEFAULT_BRANCH, GITHUB_REPO_NAME, ) from tasks.libs.common.git import ( @@ -28,8 +27,10 @@ check_clean_branch_state, clone, get_current_branch, + get_default_branch, get_last_commit, get_last_release_tag, + is_agent6, try_git_command, ) from tasks.libs.common.gomodules import get_default_modules @@ -47,6 +48,8 @@ release_manager, ) from tasks.libs.releasing.json import ( + DEFAULT_BRANCHES, + DEFAULT_BRANCHES_AGENT6, UNFREEZE_REPO_AGENT, UNFREEZE_REPOS, _get_release_json_value, @@ -371,7 +374,7 @@ def create_rc(ctx, major_versions="6,7", patch_version=False, upstream="origin", if not check_base_branch(current_branch, new_highest_version): raise Exit( color_message( - f"The branch you are on is neither {DEFAULT_BRANCH} or the correct release branch ({new_highest_version.branch()}). Aborting.", + f"The branch you are on is neither {get_default_branch()} or the correct release branch ({new_highest_version.branch()}). Aborting.", "red", ), code=1, @@ -472,7 +475,7 @@ def build_rc(ctx, major_versions="6,7", patch_version=False, k8s_deployments=Fal if not check_base_branch(current_branch, new_version): raise Exit( color_message( - f"The branch you are on is neither {DEFAULT_BRANCH} or the correct release branch ({new_version.branch()}). Aborting.", + f"The branch you are on is neither {get_default_branch()} or the correct release branch ({new_version.branch()}). Aborting.", "red", ), code=1, @@ -663,7 +666,7 @@ def create_release_branches(ctx, base_directory="~/dd", major_versions="6,7", up create_release_pr( f"[release] Update current milestone to {next}", - "main", + get_default_branch(), milestone_branch, next, ) @@ -683,8 +686,8 @@ def create_release_branches(ctx, base_directory="~/dd", major_versions="6,7", up with open(file, "w") as gl: for line in file_content: - if re.search(r"compare_to: main", line): - gl.write(line.replace("main", f"{release_branch}")) + if re.search(rf"compare_to: {get_default_branch()}", line): + gl.write(line.replace(get_default_branch(), f"{release_branch}")) else: gl.write(line) @@ -754,7 +757,7 @@ def cleanup(ctx): current_milestone = _update_last_stable(ctx, version) # create pull request to update last stable version - main_branch = "main" + main_branch = get_default_branch() cleanup_branch = f"release/{version}-cleanup" ctx.run(f"git checkout -b {cleanup_branch}") ctx.run("git add release.json") @@ -787,9 +790,10 @@ def cleanup(ctx): @task def check_omnibus_branches(ctx): base_branch = _get_release_json_value('base_branch') - if base_branch == 'main': - omnibus_ruby_branch = 'datadog-5.5.0' - omnibus_software_branch = 'master' + if base_branch == get_default_branch(): + default_branches = DEFAULT_BRANCHES_AGENT6 if is_agent6(ctx) else DEFAULT_BRANCHES + omnibus_ruby_branch = default_branches['omnibus-ruby'] + omnibus_software_branch = default_branches['omnibus-software'] else: omnibus_ruby_branch = base_branch omnibus_software_branch = base_branch @@ -913,7 +917,7 @@ def get_active_release_branch(_): if release_branch: print(f"{release_branch.name}") else: - print("main") + print(get_default_branch()) @task diff --git a/tasks/setup.py b/tasks/setup.py index 0426543a16010..5e1eeee98d1f1 100644 --- a/tasks/setup.py +++ b/tasks/setup.py @@ -17,6 +17,7 @@ from tasks import vscode from tasks.libs.common.color import Color, color_message +from tasks.libs.common.git import get_default_branch from tasks.libs.common.status import Status from tasks.libs.common.utils import running_in_pyapp @@ -95,7 +96,7 @@ def check_git_repo(ctx) -> SetupResult: ctx.run("git fetch", hide=True) print(color_message("Checking main branch...", Color.BLUE)) - output = ctx.run('git rev-list "^HEAD" origin/main --count', hide=True) + output = ctx.run(f'git rev-list "^HEAD" origin/{get_default_branch()} --count', hide=True) count = output.stdout.strip() message = "" @@ -103,7 +104,7 @@ def check_git_repo(ctx) -> SetupResult: if count != "0": status = Status.WARN - message = f"Your branch is {count} commit(s) behind main. Please update your branch." + message = f"Your branch is {count} commit(s) behind {get_default_branch()}. Please update your branch." return SetupResult("Check git repository", status, message) diff --git a/tasks/unit_tests/libs/common/worktree_tests.py b/tasks/unit_tests/libs/common/worktree_tests.py new file mode 100644 index 0000000000000..3998357c1accc --- /dev/null +++ b/tasks/unit_tests/libs/common/worktree_tests.py @@ -0,0 +1,97 @@ +import os +import unittest + +from invoke import Context + +from tasks.libs.common.git import get_default_branch +from tasks.libs.common.gomodules import get_default_modules +from tasks.libs.common.worktree import agent_context, init_env, is_worktree + + +def get_ctx(): + return Context() + + +class TestWorktree(unittest.TestCase): + def setUp(self): + # Pull only once + init_env(get_ctx(), '6.53.x') + os.environ['AGENT_WORKTREE_NO_PULL'] = '1' + + def test_context_is_worktree_true(self): + with agent_context(get_ctx(), '6.53.x'): + self.assertTrue(is_worktree()) + + def test_context_is_worktree_false(self): + self.assertFalse(is_worktree()) + + def test_context_nested(self): + with agent_context(get_ctx(), '6.53.x'): + with agent_context(get_ctx(), '6.53.x'): + self.assertTrue(is_worktree()) + self.assertTrue(is_worktree()) + + def test_context_pwd(self): + ctx = get_ctx() + + with agent_context(ctx, None, skip_checkout=True): + pwdnone = ctx.run('pwd').stdout + + with agent_context(ctx, '6.53.x'): + pwd6 = ctx.run('pwd').stdout + + with agent_context(ctx, 'main'): + pwdmain = ctx.run('pwd').stdout + + self.assertEqual(pwd6, pwdnone) + self.assertEqual(pwd6, pwdmain) + + def test_context_modules(self): + ctx = get_ctx() + + with agent_context(ctx, 'main'): + modules7 = get_default_modules() + + with agent_context(ctx, '6.53.x'): + modules6 = get_default_modules() + + self.assertNotEqual(set(modules6.keys()), set(modules7.keys())) + + def test_context_branch(self): + ctx = get_ctx() + + with agent_context(ctx, 'main'): + branch7 = get_default_branch() + + with agent_context(ctx, '6.53.x'): + branch6 = get_default_branch() + + self.assertNotEqual(branch6, branch7) + + def test_context_no_checkout(self): + ctx = get_ctx() + + with agent_context(ctx, '6.53.x'): + branch6 = get_default_branch() + + with agent_context(ctx, 'main'): + branch7 = get_default_branch() + + with agent_context(ctx, 'main', skip_checkout=True): + branch_no_checkout = get_default_branch() + + self.assertNotEqual(branch6, branch7) + self.assertEqual(branch7, branch_no_checkout) + + def test_context_no_checkout_error(self): + ctx = get_ctx() + + with agent_context(ctx, '6.53.x'): + pass + + def switch_context(): + # The current branch is not main + with agent_context(ctx, 'main', skip_checkout=True): + pass + + self.assertRaises(AssertionError, switch_context) diff --git a/tasks/worktree.py b/tasks/worktree.py new file mode 100644 index 0000000000000..bb00ed751b370 --- /dev/null +++ b/tasks/worktree.py @@ -0,0 +1,88 @@ +from invoke import task +from invoke.exceptions import Exit + +from tasks.libs.common.color import Color, color_message +from tasks.libs.common.user_interactions import yes_no_question +from tasks.libs.common.worktree import WORKTREE_DIRECTORY, agent_context, enter_env, init_env, remove_env + + +@task +def init(ctx, branch: str | None = None): + """Will prepare the worktree context (git clone / pull of the agent branch).""" + + init_env(ctx, branch) + + +@task +def remove(ctx): + """Will remove the git worktree context.""" + + remove_env(ctx) + + +@task +def status(ctx): + """Displays the status of the worktree environment.""" + + if not WORKTREE_DIRECTORY.is_dir(): + raise Exit('No worktree environment found.') + + ctx.run(f"git -C '{WORKTREE_DIRECTORY}' status", pty=True) + + +@task +def checkout(ctx, ref): + """Changes the worktree environment to the specified ref. + + Note: + This won't pull. + """ + + if not WORKTREE_DIRECTORY.is_dir(): + raise Exit('No worktree environment found.') + + ctx.run(f"git -C '{WORKTREE_DIRECTORY}' checkout '{ref}'", pty=True) + + +@task +def pull(ctx): + """Pulls the worktree environment.""" + + if not WORKTREE_DIRECTORY.is_dir(): + raise Exit('No worktree environment found.') + + ctx.run(f"git -C '{WORKTREE_DIRECTORY}' pull", pty=True) + + +@task +def run(ctx, branch: str, command: str, skip_checkout: bool = False): + """Runs a command in the target worktree environment. + + Usage: + $ inv worktree.run 6.53.x "head CHANGELOG.rst" # Displays the changelog of the target branch + """ + + with agent_context(ctx, branch, skip_checkout=skip_checkout): + ctx.run(command) + + +@task +def invoke(ctx, branch: str, skip_checkout: bool = False, yes: bool = False): + """Enters the worktree environment in order to invoke tasks in this context. + + Note: + This task should be avoided when a --branch or --release-branch argument is available in the task. + + Usage: + > inv worktree.invoke 6.53.x --yes modules.show-all # Will show agent 6 modules + """ + + if yes or yes_no_question( + 'Warning: This task should be avoided, use --branch or --release-branch argument if available in the task. Want to proceed?', + color=Color.ORANGE, + default=False, + ): + # The tasks running after this one will be using the agent 6 environment + enter_env(ctx, branch, skip_checkout=skip_checkout) + else: + raise Exit(color_message('Aborted.', Color.RED))