diff --git a/CHANGELOG.md b/CHANGELOG.md index 44099637..fc45f824 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ All notable changes to this project will be documented in this file. - The lifetime of auto generated TLS certificates is now configurable with the role and roleGroup config property `requestedSecretLifetime`. This helps reducing frequent Pod restarts ([#660]). +- Run a `containerdebug` process in the background of each "druid" container to collect debugging information ([#667]). ### Fixed @@ -19,6 +20,7 @@ All notable changes to this project will be documented in this file. [#656]: https://github.com/stackabletech/druid-operator/pull/656 [#657]: https://github.com/stackabletech/druid-operator/pull/657 [#660]: https://github.com/stackabletech/druid-operator/pull/660 +[#667]: https://github.com/stackabletech/druid-operator/pull/667 ## [24.11.0] - 2024-11-18 diff --git a/rust/crd/src/lib.rs b/rust/crd/src/lib.rs index 9ee776d1..5a4929a8 100644 --- a/rust/crd/src/lib.rs +++ b/rust/crd/src/lib.rs @@ -75,7 +75,7 @@ pub const JVM_SECURITY_PROPERTIES_FILE: &str = "security.properties"; pub const STACKABLE_TRUST_STORE: &str = "/stackable/truststore.p12"; pub const STACKABLE_TRUST_STORE_PASSWORD: &str = "changeit"; pub const CERTS_DIR: &str = "/stackable/certificates"; -pub const LOG_DIR: &str = "/stackable/log"; +pub const STACKABLE_LOG_DIR: &str = "/stackable/log"; // store file names pub const DRUID_LOG_FILE: &str = "druid.log4j2.xml"; @@ -604,6 +604,7 @@ impl DruidRole { {COMMON_BASH_TRAP_FUNCTIONS} {remove_vector_shutdown_file_command} prepare_signal_handlers + CONTAINERDEBUG_LOG_DIRECTORY={STACKABLE_LOG_DIR}/containerdebug containerdebug --output={STACKABLE_LOG_DIR}/containerdebug-state.json --loop & /stackable/druid/bin/run-druid {process_name} {RW_CONFIG_DIRECTORY} & echo \"$!\" >> /tmp/DRUID_PID wait_for_termination $(cat /tmp/DRUID_PID) @@ -611,9 +612,9 @@ impl DruidRole { ", process_name = self.get_process_name(), 
remove_vector_shutdown_file_command = - remove_vector_shutdown_file_command(LOG_DIR), + remove_vector_shutdown_file_command(STACKABLE_LOG_DIR), create_vector_shutdown_file_command = - create_vector_shutdown_file_command(LOG_DIR), + create_vector_shutdown_file_command(STACKABLE_LOG_DIR), } } } diff --git a/rust/operator-binary/src/druid_controller.rs b/rust/operator-binary/src/druid_controller.rs index 1ed6a666..f7401236 100644 --- a/rust/operator-binary/src/druid_controller.rs +++ b/rust/operator-binary/src/druid_controller.rs @@ -17,9 +17,9 @@ use stackable_druid_crd::{ Container, DeepStorageSpec, DruidCluster, DruidClusterStatus, DruidRole, APP_NAME, AUTH_AUTHORIZER_OPA_URI, CREDENTIALS_SECRET_PROPERTY, DB_PASSWORD_ENV, DB_USERNAME_ENV, DRUID_CONFIG_DIRECTORY, DS_BUCKET, EXTENSIONS_LOADLIST, HDFS_CONFIG_DIRECTORY, JVM_CONFIG, - JVM_SECURITY_PROPERTIES_FILE, LOG_CONFIG_DIRECTORY, LOG_DIR, MAX_DRUID_LOG_FILES_SIZE, - RUNTIME_PROPS, RW_CONFIG_DIRECTORY, S3_ACCESS_KEY, S3_ENDPOINT_URL, S3_PATH_STYLE_ACCESS, - S3_SECRET_KEY, ZOOKEEPER_CONNECTION_STRING, + JVM_SECURITY_PROPERTIES_FILE, LOG_CONFIG_DIRECTORY, MAX_DRUID_LOG_FILES_SIZE, RUNTIME_PROPS, + RW_CONFIG_DIRECTORY, S3_ACCESS_KEY, S3_ENDPOINT_URL, S3_PATH_STYLE_ACCESS, S3_SECRET_KEY, + STACKABLE_LOG_DIR, ZOOKEEPER_CONNECTION_STRING, }; use stackable_operator::{ builder::{ @@ -964,7 +964,7 @@ fn build_rolegroup_statefulset( // This command needs to be added at the beginning of the shell commands, // otherwise the output of the following commands will not be captured! 
prepare_container_commands.push(product_logging::framework::capture_shell_output( - LOG_DIR, + STACKABLE_LOG_DIR, &prepare_container_name, log_config, )); @@ -1292,10 +1292,10 @@ fn add_log_volume_and_volume_mounts( pb: &mut PodBuilder, ) -> Result<()> { cb_druid - .add_volume_mount(LOG_VOLUME_NAME, LOG_DIR) + .add_volume_mount(LOG_VOLUME_NAME, STACKABLE_LOG_DIR) .context(AddVolumeMountSnafu)?; cb_prepare - .add_volume_mount(LOG_VOLUME_NAME, LOG_DIR) + .add_volume_mount(LOG_VOLUME_NAME, STACKABLE_LOG_DIR) .context(AddVolumeMountSnafu)?; pb.add_volume( VolumeBuilder::new(LOG_VOLUME_NAME) diff --git a/rust/operator-binary/src/product_logging.rs b/rust/operator-binary/src/product_logging.rs index aee15aad..22512884 100644 --- a/rust/operator-binary/src/product_logging.rs +++ b/rust/operator-binary/src/product_logging.rs @@ -1,6 +1,7 @@ use snafu::{OptionExt, ResultExt, Snafu}; use stackable_druid_crd::{ - Container, DruidCluster, DRUID_LOG_FILE, LOG4J2_CONFIG, LOG_DIR, MAX_DRUID_LOG_FILES_SIZE, + Container, DruidCluster, DRUID_LOG_FILE, LOG4J2_CONFIG, MAX_DRUID_LOG_FILES_SIZE, + STACKABLE_LOG_DIR, }; use stackable_operator::{ builder::configmap::ConfigMapBuilder, @@ -90,7 +91,10 @@ pub fn extend_role_group_config_map( cm_builder.add_data( LOG4J2_CONFIG, product_logging::framework::create_log4j2_config( - &format!("{LOG_DIR}/{container}", container = Container::Druid), + &format!( + "{STACKABLE_LOG_DIR}/{container}", + container = Container::Druid + ), DRUID_LOG_FILE, MAX_DRUID_LOG_FILES_SIZE .scale_to(BinaryMultiple::Mebi) diff --git a/tests/templates/kuttl/smoke/50-assert.yaml b/tests/templates/kuttl/smoke/50-assert.yaml index a63f3bfa..32ae0af1 100644 --- a/tests/templates/kuttl/smoke/50-assert.yaml +++ b/tests/templates/kuttl/smoke/50-assert.yaml @@ -135,3 +135,14 @@ status: expectedPods: 1 currentHealthy: 1 disruptionsAllowed: 1 +--- +# This test checks if the containerdebug-state.json file is present and valid +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert 
+timeout: 600 +commands: + - script: kubectl exec -n $NAMESPACE --container druid druid-coordinator-default-0 -- cat /stackable/log/containerdebug-state.json | jq --exit-status + - script: kubectl exec -n $NAMESPACE --container druid druid-router-default-0 -- cat /stackable/log/containerdebug-state.json | jq --exit-status + - script: kubectl exec -n $NAMESPACE --container druid druid-middlemanager-default-0 -- cat /stackable/log/containerdebug-state.json | jq --exit-status + - script: kubectl exec -n $NAMESPACE --container druid druid-broker-default-0 -- cat /stackable/log/containerdebug-state.json | jq --exit-status + - script: kubectl exec -n $NAMESPACE --container druid druid-historical-default-0 -- cat /stackable/log/containerdebug-state.json | jq --exit-status