diff --git a/build.sh b/build.sh
index c8aa2bf2c43626..a457fb58404bfa 100755
--- a/build.sh
+++ b/build.sh
@@ -885,6 +885,9 @@ if [[ ${BUILD_CLOUD} -eq 1 ]]; then
     cp -r -p "${DORIS_HOME}/cloud/output" "${DORIS_HOME}/output/ms"
 fi

+mkdir -p "${DORIS_HOME}/output/tools"
+cp -r -p tools/fdb "${DORIS_HOME}/output/tools"
+
 echo "***************************************"
 echo "Successfully build Doris"
 echo "***************************************"
diff --git a/tools/fdb/fdb_ctl.sh b/tools/fdb/fdb_ctl.sh
index 9c809abd5d4a50..daeddae25260ec 100755
--- a/tools/fdb/fdb_ctl.sh
+++ b/tools/fdb/fdb_ctl.sh
@@ -77,7 +77,7 @@ function ensure_port_is_listenable() {

 function download_fdb() {
     if [[ -d "${FDB_PKG_DIR}" ]]; then
-        echo "FDB ${FDB_VERSION} already exists"
+        echo "FDB package for ${FDB_VERSION} already exists"
         return
     fi

@@ -135,8 +135,8 @@ get_fdb_mode() {

 # Function to calculate number of processes
 calculate_process_numbers() {
-    # local memory_gb=$1
-    local cpu_cores=$2
+    local memory_limit_gb=$1
+    local cpu_cores_limit=$2
     local min_processes=1
     local data_dir_count

@@ -145,27 +145,87 @@ calculate_process_numbers() {
     IFS=',' read -r -a DATA_DIR_ARRAY <<<"${DATA_DIRS}"
     data_dir_count=${#DATA_DIR_ARRAY[@]}

-    # Stateless processes (at least 1, up to 1/4 of CPU cores)
-    local stateless_processes=$((cpu_cores / 4))
-    [[ ${stateless_processes} -lt ${min_processes} ]] && stateless_processes=${min_processes}
+    # Parse the ratio input
+    IFS=':' read -r num_storage num_stateless num_log <<<"${STORAGE_STATELESS_LOG_RATIO}"

-    # Storage processes (must be a multiple of the number of data directories)
-    local storage_processes=$((cpu_cores / 4))
-    [[ ${storage_processes} -lt ${data_dir_count} ]] && storage_processes=${data_dir_count}
-    storage_processes=$(((storage_processes / data_dir_count) * data_dir_count))
+    # Initialize process counts
+    local storage_processes=0   # Storage processes
+    local stateless_processes=0 # Stateless processes
+    local log_processes=0       # Log processes

-    # Transaction processes (must be a multiple of the number of data directories)
-    local transaction_processes=$((cpu_cores / 8))
-    [[ ${transaction_processes} -lt ${min_processes} ]] && transaction_processes=${min_processes}
-    [[ ${transaction_processes} -lt ${data_dir_count} ]] && transaction_processes=${data_dir_count}
-    transaction_processes=$(((transaction_processes / data_dir_count) * data_dir_count))
+    local storage_process_num_limit=$((STORAGE_PROCESSES_NUM_PER_SSD * data_dir_count))
+    local log_process_num_limit=$((LOG_PROCESSES_NUM_PER_SSD * data_dir_count))
+
+    if [[ "${MEDIUM_TYPE}" == "HDD" ]]; then
+        storage_process_num_limit=$((STORAGE_PROCESSES_NUM_PER_HDD * data_dir_count))
+        log_process_num_limit=$((LOG_PROCESSES_NUM_PER_HDD * data_dir_count))
+    fi
+
+    # Find the maximum number of processes while maintaining the specified ratio
+    while true; do
+        # Calculate process counts based on the ratio
+        storage_processes=$((storage_processes + num_storage))
+        stateless_processes=$((storage_processes * num_stateless / num_storage))
+        log_processes=$((storage_processes * num_log / num_storage))
+
+        # Calculate total CPUs used
+        local total_cpu_used=$((storage_processes + stateless_processes + log_processes))
+
+        # Calculate total memory used
+        local total_memory_used=$(( (MEMORY_STORAGE_GB * storage_processes) + (MEMORY_STATELESS_GB * stateless_processes) + (MEMORY_LOG_GB * log_processes) ))
+
+        # Check the per-data-dir limits
+        if ((storage_processes > storage_process_num_limit || log_processes > log_process_num_limit)); then
+            break
+        fi
+
+        # Check the overall constraints
+        if ((total_memory_used <= memory_limit_gb && total_cpu_used <= cpu_cores_limit)); then
+            continue
+        else
+            # If constraints are violated, revert the last step
+            storage_processes=$((storage_processes - num_storage))
+            stateless_processes=$((storage_processes * num_stateless / num_storage))
+            log_processes=$((storage_processes * num_log / num_storage))
+            break
+        fi
+    done

     # Return the values
-    echo "${stateless_processes} ${storage_processes} ${transaction_processes}"
+    echo "${stateless_processes} ${storage_processes} ${log_processes}"
+}
+
+function check_vars() {
+    IFS=',' read -r -a IPS <<<"${FDB_CLUSTER_IPS}"
+
+    if [[ -z "$(which ping)" ]]; then
+        echo "ping is not available to check whether the machines are reachable, please install ping."
+    fi
+
+    for IP_ADDRESS in "${IPS[@]}"; do
+        if ping -c 1 "${IP_ADDRESS}" &>/dev/null; then
+            echo "${IP_ADDRESS} is reachable"
+        else
+            echo "${IP_ADDRESS} is not reachable"
+            exit 1
+        fi
+    done
+
+    if [[ ${CPU_CORES_LIMIT} -gt $(nproc) ]]; then
+        echo "CPU_CORES_LIMIT exceeds the number of CPU cores on this machine, which is $(nproc)"
+        exit 1
+    fi
+
+    if [[ ${MEMORY_LIMIT_GB} -gt $(free -g | awk '/^Mem:/{print $2}') ]]; then
+        echo "MEMORY_LIMIT_GB exceeds the memory of this machine, which is $(free -g | awk '/^Mem:/{print $2}') GB"
+        exit 1
+    fi
 }

 function deploy_fdb() {
+    check_vars
     download_fdb
+    check_fdb_running

     ln -sf "${FDB_PKG_DIR}/fdbserver" "${FDB_HOME}/fdbserver"
     ln -sf "${FDB_PKG_DIR}/fdbmonitor" "${FDB_HOME}/fdbmonitor"
@@ -178,6 +238,10 @@ function deploy_fdb() {
     IFS=',' read -r -a DATA_DIR_ARRAY <<<"${DATA_DIRS}"
     for DIR in "${DATA_DIR_ARRAY[@]}"; do
         mkdir -p "${DIR}" || handle_error "Failed to create data directory ${DIR}"
+        if [[ -n "$(ls -A "${DIR}")" ]]; then
+            echo "Error: ${DIR} is not empty. DO NOT run deploy on a node that is already running fdb. If you are sure this node is not part of an fdb cluster, run 'fdb_ctl.sh clean' first."
+            exit 1
+        fi
     done

     echo -e "\tCreate fdb.cluster, coordinator: $(get_coordinators)"
@@ -210,7 +274,14 @@ EOF
     CPU_CORES_LIMIT=${CPU_CORES_LIMIT:-1}

     # Calculate number of processes based on resources and data directories
-    read -r stateless_processes storage_processes transaction_processes <<<"$(calculate_process_numbers "${MEMORY_LIMIT_GB}" "${CPU_CORES_LIMIT}")"
+    read -r stateless_processes storage_processes log_processes <<<"$(calculate_process_numbers "${MEMORY_LIMIT_GB}" "${CPU_CORES_LIMIT}")"
+    echo "stateless processes: ${stateless_processes}, storage processes: ${storage_processes}, log processes: ${log_processes}"
+    if [[ ${storage_processes} -eq 0 ]]; then
+        # Add one process
+        PORT=$((FDB_PORT))
+        echo "[fdbserver.${PORT}]
+" >>"${FDB_HOME}/conf/fdb.conf"
+    fi

     # Add stateless processes
     for ((i = 0; i < stateless_processes; i++)); do
@@ -233,12 +304,12 @@ datadir = ${DATA_DIR_ARRAY[${DIR_INDEX}]}/${PORT}" | tee -a "${FDB_HOME}/conf/fd

     FDB_PORT=$((FDB_PORT + storage_processes))

-    # Add transaction processes
-    for ((i = 0; i < transaction_processes; i++)); do
+    # Add log processes
+    for ((i = 0; i < log_processes; i++)); do
         PORT=$((FDB_PORT + i))
         DIR_INDEX=$((i % STORAGE_DIR_COUNT))
         echo "[fdbserver.${PORT}]
-class = transaction
+class = log
 datadir = ${DATA_DIR_ARRAY[${DIR_INDEX}]}/${PORT}" | tee -a "${FDB_HOME}/conf/fdb.conf" >/dev/null
     done

@@ -250,6 +321,8 @@ logdir = ${LOG_DIR}" >>"${FDB_HOME}/conf/fdb.conf"
 }

 function start_fdb() {
+    check_fdb_running
+
     if [[ ! -f "${FDB_HOME}/fdbmonitor" ]]; then
         echo 'Please run setup before start fdb server'
         exit 1
@@ -275,6 +348,18 @@ function stop_fdb() {
     fi
 }

+function check_fdb_running() {
+    if [[ -f "${FDB_HOME}/fdbmonitor.pid" ]]; then
+        local fdb_pid
+
+        fdb_pid=$(cat "${FDB_HOME}/fdbmonitor.pid")
+        if ps -p "${fdb_pid}" >/dev/null; then
+            echo "fdbmonitor with pid ${fdb_pid} is running, stop it first."
+            exit 1
+        fi
+    fi
+}
+
 function clean_fdb() {
     if [[ -f "${FDB_HOME}/fdbmonitor.pid" ]]; then
         local fdb_pid
@@ -307,8 +392,6 @@ function clean_fdb() {

 function deploy() {
     local job="$1"
-    local skip_pkg="$2"
-    local skip_config="$3"

     if [[ ${job} =~ ^(all|fdb)$ ]]; then
         deploy_fdb
@@ -324,16 +407,21 @@ function start() {
     fi

     if [[ ${init} =~ ^(all|fdb)$ ]]; then
-        echo "Try create database ..."
         local fdb_mode
         fdb_mode=$(get_fdb_mode)
+
+        echo "Try to create database in fdb ${fdb_mode} mode"
+
         "${FDB_HOME}/fdbcli" -C "${FDB_HOME}/conf/fdb.cluster" \
-            --exec "configure new ${fdb_mode} ssd" || true
+            --exec "configure new ${fdb_mode} ssd" || \
+            "${FDB_HOME}/fdbcli" -C "${FDB_HOME}/conf/fdb.cluster" --exec "status" || \
+            { echo "Failed to start fdb, please check that all nodes have the same FDB_CLUSTER_ID" && \
+                exit 1; }
     fi

-    echo "Start fdb success, and the cluster is:"
-    cat "${FDB_HOME}/conf/fdb.cluster"
+    echo "Started fdb successfully, you can now set the MetaService conf:"
+    echo "fdb_cluster = $(cat "${FDB_HOME}/conf/fdb.cluster")"
 }

 function stop() {
@@ -359,16 +447,12 @@ function status() {
 }

 function usage() {
-    echo "Usage: $0 [--skip-pkg] [--skip-config]"
+    echo "Usage: $0 <cmd>"
     echo -e "\t deploy \t setup fdb env (dir, binary, conf ...)"
     echo -e "\t clean \t clean fdb data"
     echo -e "\t start \t start fdb"
     echo -e "\t stop \t stop fdb"
-    echo -e ""
-    echo -e ""
-    echo -e "Args:"
-    echo -e "\t --skip-pkg \t skip to update binary pkgs during deploy"
-    echo -e "\t --skip-config \t skip to update config during deploy"
+    echo -e "\t fdbcli \t run fdbcli"
     echo -e ""
     exit 1
 }
@@ -390,12 +474,10 @@ shift

 job="fdb"
 init="fdb"
-skip_pkg="false"
-skip_config="false"

 case ${cmd} in
 deploy)
-    deploy "${job}" "${skip_pkg}" "${skip_config}"
+    deploy "${job}"
     ;;
 start)
     start "${job}" "${init}"
diff --git a/tools/fdb/fdb_vars.sh b/tools/fdb/fdb_vars.sh
index c0bbadabdd6cd1..d44c5d5dff40d7 100644
--- a/tools/fdb/fdb_vars.sh
+++ b/tools/fdb/fdb_vars.sh
@@ -25,13 +25,15 @@
 # shellcheck disable=2034
 DATA_DIRS="/mnt/foundationdb/data1,/mnt/foundationdb/data2,/mnt/foundationdb/data3"

+MEDIUM_TYPE="SSD"
+
 # Define the cluster IPs (comma-separated list of IP addresses)
 # You should have at least 3 IP addresses for a production cluster
 # The first IP addresses will be used as the coordinator,
 # num of coordinators depends on the number of nodes, see the function get_coordinators.
 # For high availability, machines should be in diffrent rack.
 # shellcheck disable=2034
-FDB_CLUSTER_IPS="172.200.0.2,172.200.0.3,172.200.0.4"
+FDB_CLUSTER_IPS="172.200.0.5,172.200.0.6,172.200.0.7"

 # Define the FoundationDB home directory, which contains the fdb binaries and logs.
 # default is /fdbhome and have to be absolute path.
@@ -41,23 +43,23 @@ FDB_HOME="/fdbhome"
 # Define the cluster id, shoule be generated random like mktemp -u XXXXXXXX,
 # have to be different for each cluster.
 # shellcheck disable=2034
-FDB_CLUSTER_ID=$(mktemp -u XXXXXXXX)
+FDB_CLUSTER_ID="ra7eOp7x"

 # Define the cluster description, you 'd better to change it.
 # shellcheck disable=2034
 FDB_CLUSTER_DESC="mycluster"

-#======================= OPTIONAL CUSTOMIZATION ============================
 # Define resource limits
 # Memory limit in gigabytes
 # shellcheck disable=2034
-MEMORY_LIMIT_GB=16
+MEMORY_LIMIT_GB=64

 # CPU cores limit
 # shellcheck disable=2034
-CPU_CORES_LIMIT=8
+CPU_CORES_LIMIT=16
+
+#======================= OPTIONAL CUSTOMIZATION ============================

-#===========================================================================
 # Define starting port for the servers
 # This is the base port number for the fdbserver processes, usually does not need to be changed
 # shellcheck disable=2034
@@ -70,3 +72,24 @@ FDB_VERSION="7.1.38"
 # Users who run the fdb processes, default is the current user
 # shellcheck disable=2034
 USER=$(whoami)
+
+# Ratio of storage, stateless and log process numbers in fdb
+# shellcheck disable=2034
+STORAGE_STATELESS_LOG_RATIO="2:1:1"
+
+# Set process limits
+# shellcheck disable=2034
+STORAGE_PROCESSES_NUM_PER_HDD=1
+# shellcheck disable=2034
+LOG_PROCESSES_NUM_PER_HDD=1
+# shellcheck disable=2034
+STORAGE_PROCESSES_NUM_PER_SSD=4
+# shellcheck disable=2034
+LOG_PROCESSES_NUM_PER_SSD=4
+
+# shellcheck disable=2034
+MEMORY_STORAGE_GB=8
+# shellcheck disable=2034
+MEMORY_STATELESS_GB=1
+# shellcheck disable=2034
+MEMORY_LOG_GB=2
\ No newline at end of file
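
For reference, the allocation loop in calculate_process_numbers is easiest to follow with concrete numbers. The standalone sketch below re-runs the same arithmetic with the defaults from fdb_vars.sh (64 GB, 16 cores, ratio 2:1:1, three SSD data dirs); the shortened variable names are illustrative only and are not part of the scripts.

#!/usr/bin/env bash
# Sketch of the calculate_process_numbers arithmetic with the default vars.
memory_limit_gb=64
cpu_cores_limit=16
data_dir_count=3
num_storage=2 num_stateless=1 num_log=1   # STORAGE_STATELESS_LOG_RATIO="2:1:1"
storage_limit=$((4 * data_dir_count))     # STORAGE_PROCESSES_NUM_PER_SSD=4
log_limit=$((4 * data_dir_count))         # LOG_PROCESSES_NUM_PER_SSD=4

storage=0 stateless=0 log=0
while true; do
    # Grow storage by one ratio step and derive the other classes from it
    storage=$((storage + num_storage))
    stateless=$((storage * num_stateless / num_storage))
    log=$((storage * num_log / num_storage))
    cpu=$((storage + stateless + log))
    mem=$((8 * storage + 1 * stateless + 2 * log))  # MEMORY_{STORAGE,STATELESS,LOG}_GB
    if ((storage > storage_limit || log > log_limit)); then
        break
    fi
    if ((mem <= memory_limit_gb && cpu <= cpu_cores_limit)); then
        continue
    fi
    # Over budget: revert the last step, keeping the ratio, and stop
    storage=$((storage - num_storage))
    stateless=$((storage * num_stateless / num_storage))
    log=$((storage * num_log / num_storage))
    break
done
echo "stateless=${stateless} storage=${storage} log=${log}"
# Prints: stateless=3 storage=6 log=3
# (12 cores and 6*8 + 3*1 + 3*2 = 57 GB; one more round would need 76 GB > 64 GB)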
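
A rough end-to-end usage sketch under the same defaults follows; the paths and the fdb_cluster key come from the scripts above, while the step ordering is a suggested workflow rather than a prescribed one.

# Run on every FDB node, as the user that will own the fdb processes.
cd output/tools/fdb          # copied into output/ by build.sh in this patch

# 1. Edit fdb_vars.sh: DATA_DIRS, MEDIUM_TYPE, FDB_CLUSTER_IPS,
#    a fixed FDB_CLUSTER_ID (identical on all nodes), MEMORY_LIMIT_GB,
#    CPU_CORES_LIMIT and, optionally, the ratio and per-disk limits.

# 2. Deploy: validates the vars, downloads the FDB package, and refuses
#    to run if a data dir is non-empty or fdbmonitor is already running.
./fdb_ctl.sh deploy

# 3. Start: launches fdbmonitor and tries "configure new <mode> ssd";
#    on nodes where the database already exists it falls back to "status".
./fdb_ctl.sh start

# 4. Copy the printed "fdb_cluster = ..." line into the MetaService conf,
#    or check the cluster directly:
/fdbhome/fdbcli -C /fdbhome/conf/fdb.cluster --exec 'status'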