From d3c3d8b8e401d239416f323c69ceb12c67e3c26d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 11 Mar 2023 16:48:40 +0100 Subject: [PATCH 01/15] Remove export of dynamic symbols --- CMakeLists.txt | 10 ++++++++-- programs/library-bridge/CMakeLists.txt | 4 ---- programs/odbc-bridge/CMakeLists.txt | 6 ------ 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 263b202049b3..3283ca52ca7a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -155,8 +155,14 @@ elseif(GLIBC_COMPATIBILITY) message (${RECONFIGURE_MESSAGE_LEVEL} "Glibc compatibility cannot be enabled in current configuration") endif () -# Make sure the final executable has symbols exported -set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic") +if (OS_LINUX) + # We should not export dynamic symbols, because: + # - The main clickhouse binary does not use dlopen, + # and whatever is poisoning it by LD_PRELOAD should not link to our symbols. + # - The clickhouse-odbc-bridge and clickhouse-library-bridge binaries + # should not expose their symbols to ODBC drivers and libraries. + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--no-export-dynamic") +endif () if (OS_DARWIN) # The `-all_load` flag forces loading of all symbols from all libraries, diff --git a/programs/library-bridge/CMakeLists.txt b/programs/library-bridge/CMakeLists.txt index 1cacc391ca52..dd0bf67cb64c 100644 --- a/programs/library-bridge/CMakeLists.txt +++ b/programs/library-bridge/CMakeLists.txt @@ -13,10 +13,6 @@ set (CLICKHOUSE_LIBRARY_BRIDGE_SOURCES library-bridge.cpp ) -if (OS_LINUX) - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--no-export-dynamic") -endif () - clickhouse_add_executable(clickhouse-library-bridge ${CLICKHOUSE_LIBRARY_BRIDGE_SOURCES}) target_link_libraries(clickhouse-library-bridge PRIVATE diff --git a/programs/odbc-bridge/CMakeLists.txt b/programs/odbc-bridge/CMakeLists.txt index 118610e4dcdb..56373601b959 100644 --- a/programs/odbc-bridge/CMakeLists.txt +++ b/programs/odbc-bridge/CMakeLists.txt @@ -15,12 +15,6 @@ set (CLICKHOUSE_ODBC_BRIDGE_SOURCES validateODBCConnectionString.cpp ) -if (OS_LINUX) - # clickhouse-odbc-bridge is always a separate binary. - # Reason: it must not export symbols from SSL, mariadb-client, etc. to not break ABI compatibility with ODBC drivers. - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--no-export-dynamic") -endif () - clickhouse_add_executable(clickhouse-odbc-bridge ${CLICKHOUSE_ODBC_BRIDGE_SOURCES}) target_link_libraries(clickhouse-odbc-bridge PRIVATE From 5a3281bb4912dce6a6125681b0804b97653da763 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 11 Mar 2023 19:30:03 +0100 Subject: [PATCH 02/15] Remove unused code --- src/Common/getResource.cpp | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/src/Common/getResource.cpp b/src/Common/getResource.cpp index fe603fcc5502..09777640dd94 100644 --- a/src/Common/getResource.cpp +++ b/src/Common/getResource.cpp @@ -1,6 +1,4 @@ #include "getResource.h" -#include -#include #include #include @@ -14,39 +12,6 @@ std::string_view getResource(std::string_view name) std::replace(name_replaced.begin(), name_replaced.end(), '.', '_'); boost::replace_all(name_replaced, "+", "_PLUS_"); -#if defined USE_MUSL /// If static linking is used, we cannot use dlsym and have to parse ELF symbol table by ourself. return DB::SymbolIndex::instance()->getResource(name_replaced); - -#else - // In most `dlsym(3)` APIs, one passes the symbol name as it appears via - // something like `nm` or `objdump -t`. For example, a symbol `_foo` would be - // looked up with the string `"_foo"`. - // - // Apple's linker is confusingly different. The NOTES on the man page for - // `dlsym(3)` claim that one looks up the symbol with "the name used in C - // source code". In this example, that would mean using the string `"foo"`. - // This apparently applies even in the case where the symbol did not originate - // from C source, such as the embedded binary resource files used here. So - // the symbol name must not have a leading `_` on Apple platforms. It's not - // clear how this applies to other symbols, such as those which _have_ a leading - // underscore in them by design, many leading underscores, etc. -#if defined OS_DARWIN - std::string prefix = "binary_"; -#else - std::string prefix = "_binary_"; -#endif - std::string symbol_name_start = prefix + name_replaced + "_start"; - std::string symbol_name_end = prefix + name_replaced + "_end"; - - const char * sym_start = reinterpret_cast(dlsym(RTLD_DEFAULT, symbol_name_start.c_str())); - const char * sym_end = reinterpret_cast(dlsym(RTLD_DEFAULT, symbol_name_end.c_str())); - - if (sym_start && sym_end) - { - auto resource_size = static_cast(std::distance(sym_start, sym_end)); - return { sym_start, resource_size }; - } - return {}; -#endif } From 8cd9fc4a2d6936343c2be119d8fdd61986cdd77f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 11 Mar 2023 21:20:20 +0100 Subject: [PATCH 03/15] Fix build --- docker/packager/binary/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/packager/binary/build.sh b/docker/packager/binary/build.sh index 2cd0a011013e..c7f31e13287b 100755 --- a/docker/packager/binary/build.sh +++ b/docker/packager/binary/build.sh @@ -62,7 +62,7 @@ then ninja $NINJA_FLAGS clickhouse-keeper ls -la ./programs/ - ldd ./programs/clickhouse-keeper + ldd ./programs/clickhouse-keeper ||: if [ -n "$MAKE_DEB" ]; then # No quotes because I want it to expand to nothing if empty. From 03845ba9c5fb1ccb03330059369641751f1c9ab1 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 7 May 2023 00:16:23 +0200 Subject: [PATCH 04/15] Fix MSan. --- src/Common/SymbolIndex.cpp | 77 ++++++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 29 deletions(-) diff --git a/src/Common/SymbolIndex.cpp b/src/Common/SymbolIndex.cpp index f1cace5017c2..79f97e93a2f1 100644 --- a/src/Common/SymbolIndex.cpp +++ b/src/Common/SymbolIndex.cpp @@ -9,7 +9,6 @@ #include -//#include #include #include @@ -63,9 +62,11 @@ Otherwise you will get only exported symbols from program headers. #endif #define __msan_unpoison_string(X) // NOLINT +#define __msan_unpoison(X, Y) // NOLINT #if defined(ch_has_feature) # if ch_has_feature(memory_sanitizer) # undef __msan_unpoison_string +# undef __msan_unpoison # include # endif #endif @@ -136,10 +137,12 @@ void collectSymbolsFromProgramHeaders( /* Iterate over all headers of the current shared lib * (first call is for the executable itself) */ + __msan_unpoison(&info->dlpi_phnum, sizeof(info->dlpi_phnum)); for (size_t header_index = 0; header_index < info->dlpi_phnum; ++header_index) { /* Further processing is only needed if the dynamic section is reached */ + __msan_unpoison(&info->dlpi_phdr[header_index], sizeof(info->dlpi_phdr[header_index])); if (info->dlpi_phdr[header_index].p_type != PT_DYNAMIC) continue; @@ -160,44 +163,53 @@ void collectSymbolsFromProgramHeaders( */ size_t sym_cnt = 0; - for (const auto * it = dyn_begin; it->d_tag != DT_NULL; ++it) { - ElfW(Addr) base_address = correct_address(info->dlpi_addr, it->d_un.d_ptr); - - // TODO: this branch leads to invalid address of the hash table. Need further investigation. - // if (it->d_tag == DT_HASH) - // { - // const ElfW(Word) * hash = reinterpret_cast(base_address); - // sym_cnt = hash[1]; - // break; - // } - if (it->d_tag == DT_GNU_HASH) + const auto * it = dyn_begin; + while (true) { - /// This code based on Musl-libc. + __msan_unpoison(it, sizeof(*it)); + if (it->d_tag != DT_NULL) + break; - const uint32_t * buckets = nullptr; - const uint32_t * hashval = nullptr; + ElfW(Addr) base_address = correct_address(info->dlpi_addr, it->d_un.d_ptr); - const ElfW(Word) * hash = reinterpret_cast(base_address); + if (it->d_tag == DT_GNU_HASH) + { + /// This code based on Musl-libc. - buckets = hash + 4 + (hash[2] * sizeof(size_t) / 4); + const uint32_t * buckets = nullptr; + const uint32_t * hashval = nullptr; - for (ElfW(Word) i = 0; i < hash[0]; ++i) - if (buckets[i] > sym_cnt) - sym_cnt = buckets[i]; + const ElfW(Word) * hash = reinterpret_cast(base_address); - if (sym_cnt) - { - sym_cnt -= hash[1]; - hashval = buckets + hash[0] + sym_cnt; - do + __msan_unpoison(&hash[0], sizeof(*hash)); + __msan_unpoison(&hash[1], sizeof(*hash)); + __msan_unpoison(&hash[2], sizeof(*hash)); + + buckets = hash + 4 + (hash[2] * sizeof(size_t) / 4); + + __msan_unpoison(buckets, hash[0] * sizeof(buckets[0])); + + for (ElfW(Word) i = 0; i < hash[0]; ++i) + if (buckets[i] > sym_cnt) + sym_cnt = buckets[i]; + + if (sym_cnt) { - ++sym_cnt; + sym_cnt -= hash[1]; + hashval = buckets + hash[0] + sym_cnt; + __msan_unpoison(&hashval, sizeof(hashval)); + do + { + ++sym_cnt; + } + while (!(*hashval++ & 1)); } - while (!(*hashval++ & 1)); + + break; } - break; + ++it; } } @@ -228,6 +240,8 @@ void collectSymbolsFromProgramHeaders( /* Get the pointer to the first entry of the symbol table */ const ElfW(Sym) * elf_sym = reinterpret_cast(base_address); + __msan_unpoison(elf_sym, sym_cnt * sizeof(*elf_sym)); + /* Iterate over the symbol table */ for (ElfW(Word) sym_index = 0; sym_index < ElfW(Word)(sym_cnt); ++sym_index) { @@ -235,6 +249,7 @@ void collectSymbolsFromProgramHeaders( * This is located at the address of st_name relative to the beginning of the string table. */ const char * sym_name = &strtab[elf_sym[sym_index].st_name]; + __msan_unpoison_string(sym_name); if (!sym_name) continue; @@ -264,13 +279,17 @@ void collectSymbolsFromProgramHeaders( #if !defined USE_MUSL String getBuildIDFromProgramHeaders(dl_phdr_info * info) { + __msan_unpoison(&info->dlpi_phnum, sizeof(info->dlpi_phnum)); for (size_t header_index = 0; header_index < info->dlpi_phnum; ++header_index) { const ElfPhdr & phdr = info->dlpi_phdr[header_index]; + __msan_unpoison(&phdr, sizeof(phdr)); if (phdr.p_type != PT_NOTE) continue; - return Elf::getBuildID(reinterpret_cast(info->dlpi_addr + phdr.p_vaddr), phdr.p_memsz); + std::string_view view(reinterpret_cast(info->dlpi_addr + phdr.p_vaddr), phdr.p_memsz); + __msan_unpoison(view.data(), view.size()); + return Elf::getBuildID(view.data(), view.size()); } return {}; } From 9e513a147b5ca7ca0b75feec5488093f32df77d1 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 7 May 2023 04:22:01 +0200 Subject: [PATCH 05/15] Fixup --- src/Common/SymbolIndex.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Common/SymbolIndex.cpp b/src/Common/SymbolIndex.cpp index 79f97e93a2f1..394ae1a05920 100644 --- a/src/Common/SymbolIndex.cpp +++ b/src/Common/SymbolIndex.cpp @@ -138,6 +138,7 @@ void collectSymbolsFromProgramHeaders( * (first call is for the executable itself) */ __msan_unpoison(&info->dlpi_phnum, sizeof(info->dlpi_phnum)); + __msan_unpoison(&info->dlpi_phdr, sizeof(info->dlpi_phdr)); for (size_t header_index = 0; header_index < info->dlpi_phnum; ++header_index) { /* Further processing is only needed if the dynamic section is reached @@ -280,6 +281,7 @@ void collectSymbolsFromProgramHeaders( String getBuildIDFromProgramHeaders(dl_phdr_info * info) { __msan_unpoison(&info->dlpi_phnum, sizeof(info->dlpi_phnum)); + __msan_unpoison(&info->dlpi_phdr, sizeof(info->dlpi_phdr)); for (size_t header_index = 0; header_index < info->dlpi_phnum; ++header_index) { const ElfPhdr & phdr = info->dlpi_phdr[header_index]; From e5877b35bc41c2330857341f6d4dc37eb5b7336e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 4 Jul 2023 21:42:13 +0200 Subject: [PATCH 06/15] Preparation for #47475 --- docker/packager/binary/build.sh | 2 +- src/Common/SymbolIndex.cpp | 79 +++++++++++++++++++++------------ src/Common/getResource.cpp | 35 --------------- 3 files changed, 51 insertions(+), 65 deletions(-) diff --git a/docker/packager/binary/build.sh b/docker/packager/binary/build.sh index c0803c741475..a417ee04d3a2 100755 --- a/docker/packager/binary/build.sh +++ b/docker/packager/binary/build.sh @@ -64,7 +64,7 @@ then ninja $NINJA_FLAGS clickhouse-keeper ls -la ./programs/ - ldd ./programs/clickhouse-keeper + ldd ./programs/clickhouse-keeper ||: if [ -n "$MAKE_DEB" ]; then # No quotes because I want it to expand to nothing if empty. diff --git a/src/Common/SymbolIndex.cpp b/src/Common/SymbolIndex.cpp index f1cace5017c2..394ae1a05920 100644 --- a/src/Common/SymbolIndex.cpp +++ b/src/Common/SymbolIndex.cpp @@ -9,7 +9,6 @@ #include -//#include #include #include @@ -63,9 +62,11 @@ Otherwise you will get only exported symbols from program headers. #endif #define __msan_unpoison_string(X) // NOLINT +#define __msan_unpoison(X, Y) // NOLINT #if defined(ch_has_feature) # if ch_has_feature(memory_sanitizer) # undef __msan_unpoison_string +# undef __msan_unpoison # include # endif #endif @@ -136,10 +137,13 @@ void collectSymbolsFromProgramHeaders( /* Iterate over all headers of the current shared lib * (first call is for the executable itself) */ + __msan_unpoison(&info->dlpi_phnum, sizeof(info->dlpi_phnum)); + __msan_unpoison(&info->dlpi_phdr, sizeof(info->dlpi_phdr)); for (size_t header_index = 0; header_index < info->dlpi_phnum; ++header_index) { /* Further processing is only needed if the dynamic section is reached */ + __msan_unpoison(&info->dlpi_phdr[header_index], sizeof(info->dlpi_phdr[header_index])); if (info->dlpi_phdr[header_index].p_type != PT_DYNAMIC) continue; @@ -160,44 +164,53 @@ void collectSymbolsFromProgramHeaders( */ size_t sym_cnt = 0; - for (const auto * it = dyn_begin; it->d_tag != DT_NULL; ++it) { - ElfW(Addr) base_address = correct_address(info->dlpi_addr, it->d_un.d_ptr); - - // TODO: this branch leads to invalid address of the hash table. Need further investigation. - // if (it->d_tag == DT_HASH) - // { - // const ElfW(Word) * hash = reinterpret_cast(base_address); - // sym_cnt = hash[1]; - // break; - // } - if (it->d_tag == DT_GNU_HASH) + const auto * it = dyn_begin; + while (true) { - /// This code based on Musl-libc. + __msan_unpoison(it, sizeof(*it)); + if (it->d_tag != DT_NULL) + break; - const uint32_t * buckets = nullptr; - const uint32_t * hashval = nullptr; + ElfW(Addr) base_address = correct_address(info->dlpi_addr, it->d_un.d_ptr); - const ElfW(Word) * hash = reinterpret_cast(base_address); + if (it->d_tag == DT_GNU_HASH) + { + /// This code based on Musl-libc. - buckets = hash + 4 + (hash[2] * sizeof(size_t) / 4); + const uint32_t * buckets = nullptr; + const uint32_t * hashval = nullptr; - for (ElfW(Word) i = 0; i < hash[0]; ++i) - if (buckets[i] > sym_cnt) - sym_cnt = buckets[i]; + const ElfW(Word) * hash = reinterpret_cast(base_address); - if (sym_cnt) - { - sym_cnt -= hash[1]; - hashval = buckets + hash[0] + sym_cnt; - do + __msan_unpoison(&hash[0], sizeof(*hash)); + __msan_unpoison(&hash[1], sizeof(*hash)); + __msan_unpoison(&hash[2], sizeof(*hash)); + + buckets = hash + 4 + (hash[2] * sizeof(size_t) / 4); + + __msan_unpoison(buckets, hash[0] * sizeof(buckets[0])); + + for (ElfW(Word) i = 0; i < hash[0]; ++i) + if (buckets[i] > sym_cnt) + sym_cnt = buckets[i]; + + if (sym_cnt) { - ++sym_cnt; + sym_cnt -= hash[1]; + hashval = buckets + hash[0] + sym_cnt; + __msan_unpoison(&hashval, sizeof(hashval)); + do + { + ++sym_cnt; + } + while (!(*hashval++ & 1)); } - while (!(*hashval++ & 1)); + + break; } - break; + ++it; } } @@ -228,6 +241,8 @@ void collectSymbolsFromProgramHeaders( /* Get the pointer to the first entry of the symbol table */ const ElfW(Sym) * elf_sym = reinterpret_cast(base_address); + __msan_unpoison(elf_sym, sym_cnt * sizeof(*elf_sym)); + /* Iterate over the symbol table */ for (ElfW(Word) sym_index = 0; sym_index < ElfW(Word)(sym_cnt); ++sym_index) { @@ -235,6 +250,7 @@ void collectSymbolsFromProgramHeaders( * This is located at the address of st_name relative to the beginning of the string table. */ const char * sym_name = &strtab[elf_sym[sym_index].st_name]; + __msan_unpoison_string(sym_name); if (!sym_name) continue; @@ -264,13 +280,18 @@ void collectSymbolsFromProgramHeaders( #if !defined USE_MUSL String getBuildIDFromProgramHeaders(dl_phdr_info * info) { + __msan_unpoison(&info->dlpi_phnum, sizeof(info->dlpi_phnum)); + __msan_unpoison(&info->dlpi_phdr, sizeof(info->dlpi_phdr)); for (size_t header_index = 0; header_index < info->dlpi_phnum; ++header_index) { const ElfPhdr & phdr = info->dlpi_phdr[header_index]; + __msan_unpoison(&phdr, sizeof(phdr)); if (phdr.p_type != PT_NOTE) continue; - return Elf::getBuildID(reinterpret_cast(info->dlpi_addr + phdr.p_vaddr), phdr.p_memsz); + std::string_view view(reinterpret_cast(info->dlpi_addr + phdr.p_vaddr), phdr.p_memsz); + __msan_unpoison(view.data(), view.size()); + return Elf::getBuildID(view.data(), view.size()); } return {}; } diff --git a/src/Common/getResource.cpp b/src/Common/getResource.cpp index fe603fcc5502..09777640dd94 100644 --- a/src/Common/getResource.cpp +++ b/src/Common/getResource.cpp @@ -1,6 +1,4 @@ #include "getResource.h" -#include -#include #include #include @@ -14,39 +12,6 @@ std::string_view getResource(std::string_view name) std::replace(name_replaced.begin(), name_replaced.end(), '.', '_'); boost::replace_all(name_replaced, "+", "_PLUS_"); -#if defined USE_MUSL /// If static linking is used, we cannot use dlsym and have to parse ELF symbol table by ourself. return DB::SymbolIndex::instance()->getResource(name_replaced); - -#else - // In most `dlsym(3)` APIs, one passes the symbol name as it appears via - // something like `nm` or `objdump -t`. For example, a symbol `_foo` would be - // looked up with the string `"_foo"`. - // - // Apple's linker is confusingly different. The NOTES on the man page for - // `dlsym(3)` claim that one looks up the symbol with "the name used in C - // source code". In this example, that would mean using the string `"foo"`. - // This apparently applies even in the case where the symbol did not originate - // from C source, such as the embedded binary resource files used here. So - // the symbol name must not have a leading `_` on Apple platforms. It's not - // clear how this applies to other symbols, such as those which _have_ a leading - // underscore in them by design, many leading underscores, etc. -#if defined OS_DARWIN - std::string prefix = "binary_"; -#else - std::string prefix = "_binary_"; -#endif - std::string symbol_name_start = prefix + name_replaced + "_start"; - std::string symbol_name_end = prefix + name_replaced + "_end"; - - const char * sym_start = reinterpret_cast(dlsym(RTLD_DEFAULT, symbol_name_start.c_str())); - const char * sym_end = reinterpret_cast(dlsym(RTLD_DEFAULT, symbol_name_end.c_str())); - - if (sym_start && sym_end) - { - auto resource_size = static_cast(std::distance(sym_start, sym_end)); - return { sym_start, resource_size }; - } - return {}; -#endif } From 95eec11dd4d18bc3debf9996a56c48b02e2932f7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 5 Jul 2023 02:10:33 +0200 Subject: [PATCH 07/15] Fix error --- src/Common/SymbolIndex.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Common/SymbolIndex.cpp b/src/Common/SymbolIndex.cpp index 394ae1a05920..afa70257be95 100644 --- a/src/Common/SymbolIndex.cpp +++ b/src/Common/SymbolIndex.cpp @@ -1,7 +1,6 @@ #if defined(__ELF__) && !defined(OS_FREEBSD) #include -#include #include #include @@ -151,6 +150,7 @@ void collectSymbolsFromProgramHeaders( * It's address is the shared lib's address + the virtual address */ const ElfW(Dyn) * dyn_begin = reinterpret_cast(info->dlpi_addr + info->dlpi_phdr[header_index].p_vaddr); + __msan_unpoison(&dyn_begin, sizeof(dyn_begin)); /// For unknown reason, addresses are sometimes relative sometimes absolute. auto correct_address = [](ElfW(Addr) base, ElfW(Addr) ptr) @@ -385,6 +385,7 @@ void collectSymbolsFromELF( build_id = our_build_id; #else /// MSan does not know that the program segments in memory are initialized. + __msan_unpoison(info, sizeof(*info)); __msan_unpoison_string(info->dlpi_name); object_name = info->dlpi_name; From da0a332d46a7939377dca08147897d0f5b1dd9c1 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 24 Jul 2023 18:09:38 +0200 Subject: [PATCH 08/15] Keep symtab --- cmake/split_debug_symbols.cmake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cmake/split_debug_symbols.cmake b/cmake/split_debug_symbols.cmake index a9c2158359a5..8ba7669d9e2f 100644 --- a/cmake/split_debug_symbols.cmake +++ b/cmake/split_debug_symbols.cmake @@ -22,8 +22,9 @@ macro(clickhouse_split_debug_symbols) # Splits debug symbols into separate file, leaves the binary untouched: COMMAND "${OBJCOPY_PATH}" --only-keep-debug "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug" COMMAND chmod 0644 "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug" - # Strips binary, sections '.note' & '.comment' are removed in line with Debian's stripping policy: www.debian.org/doc/debian-policy/ch-files.html, section '.clickhouse.hash' is needed for integrity check: - COMMAND "${STRIP_PATH}" --remove-section=.comment --remove-section=.note --keep-section=.clickhouse.hash "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" + # Strips binary, sections '.note' & '.comment' are removed in line with Debian's stripping policy: www.debian.org/doc/debian-policy/ch-files.html, section '.clickhouse.hash' is needed for integrity check. + # Also, after we disabled the export of symbols for dynamic linking, we still to keep a static symbol table for good stack traces. + COMMAND "${STRIP_PATH}" --remove-section=.comment --remove-section=.note --keep-section=.clickhouse.hash --keep-section=.symtab --keep-section=.strtab --keep-section=.shstrtab "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" # Associate stripped binary with debug symbols: COMMAND "${OBJCOPY_PATH}" --add-gnu-debuglink "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" COMMENT "Stripping clickhouse binary" VERBATIM From 6aab4cc83508093d19ed84130be483e73c08b324 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 27 Jul 2023 05:25:40 +0200 Subject: [PATCH 09/15] Check for unexpected cyrillic --- src/Compression/CompressionCodecEncrypted.cpp | 2 +- src/Disks/DiskEncrypted.cpp | 4 ++-- src/IO/examples/read_buffer.cpp | 4 ++-- src/IO/examples/write_buffer.cpp | 2 +- src/IO/examples/write_buffer_perf.cpp | 2 +- src/Processors/Formats/Impl/ParquetMetadataInputFormat.cpp | 2 +- utils/check-style/check-style | 3 +++ 7 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/Compression/CompressionCodecEncrypted.cpp b/src/Compression/CompressionCodecEncrypted.cpp index 3f4e35a78a46..34b621291fd8 100644 --- a/src/Compression/CompressionCodecEncrypted.cpp +++ b/src/Compression/CompressionCodecEncrypted.cpp @@ -493,7 +493,7 @@ void CompressionCodecEncrypted::Configuration::loadImpl( /// If there is only one key with non zero ID, curren_key_id should be defined. if (new_params->keys_storage[method].size() == 1 && !new_params->keys_storage[method].contains(0)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Config has one key with non zero id. сurrent_key_id is required"); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Config has one key with non zero id. current_key_id is required"); } /// Try to find which key will be used for encryption. If there is no current_key and only one key without id diff --git a/src/Disks/DiskEncrypted.cpp b/src/Disks/DiskEncrypted.cpp index 441e639b9675..aa9c4a92adc8 100644 --- a/src/Disks/DiskEncrypted.cpp +++ b/src/Disks/DiskEncrypted.cpp @@ -433,10 +433,10 @@ void DiskEncrypted::applyNewSettings( { auto new_settings = parseDiskEncryptedSettings(name, config, config_prefix, disk_map); if (new_settings->wrapped_disk != delegate) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Сhanging wrapped disk on the fly is not supported. Disk {}", name); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Changing wrapped disk on the fly is not supported. Disk {}", name); if (new_settings->disk_path != disk_path) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Сhanging disk path on the fly is not supported. Disk {}", name); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Changing disk path on the fly is not supported. Disk {}", name); current_settings.set(std::move(new_settings)); IDisk::applyNewSettings(config, context, config_prefix, disk_map); diff --git a/src/IO/examples/read_buffer.cpp b/src/IO/examples/read_buffer.cpp index 85675c0d6131..221da24715b8 100644 --- a/src/IO/examples/read_buffer.cpp +++ b/src/IO/examples/read_buffer.cpp @@ -40,7 +40,7 @@ int readAndPrint(DB::ReadBuffer & in) int main(int, char **) { { - std::string s = "-123456 123.456 вася пе\\tтя\t'\\'xyz\\\\'"; + std::string s = "-123456 123.456 вася pe\\ttya\t'\\'xyz\\\\'"; DB::ReadBufferFromString in(s); if (readAndPrint(in)) std::cout << "readAndPrint from ReadBufferFromString failed" << std::endl; @@ -49,7 +49,7 @@ int main(int, char **) std::shared_ptr in; { - std::string s = "-123456 123.456 вася пе\\tтя\t'\\'xyz\\\\'"; + std::string s = "-123456 123.456 вася pe\\ttya\t'\\'xyz\\\\'"; in = std::make_shared(s); } if (readAndPrint(*in)) diff --git a/src/IO/examples/write_buffer.cpp b/src/IO/examples/write_buffer.cpp index bca0be24b1ad..999f9b1bb34f 100644 --- a/src/IO/examples/write_buffer.cpp +++ b/src/IO/examples/write_buffer.cpp @@ -14,7 +14,7 @@ int main(int, char **) { DB::Int64 a = -123456; DB::Float64 b = 123.456; - DB::String c = "вася пе\tтя"; + DB::String c = "вася pe\ttya"; DB::String d = "'xyz\\"; std::stringstream s; // STYLE_CHECK_ALLOW_STD_STRING_STREAM diff --git a/src/IO/examples/write_buffer_perf.cpp b/src/IO/examples/write_buffer_perf.cpp index 0b3d0a612419..3f57ddb9a4f7 100644 --- a/src/IO/examples/write_buffer_perf.cpp +++ b/src/IO/examples/write_buffer_perf.cpp @@ -14,7 +14,7 @@ int main(int, char **) { DB::Int64 a = -123456; DB::Float64 b = 123.456; - DB::String c = "вася пе\tтя"; + DB::String c = "вася pe\ttya"; DB::String d = "'xyz\\"; std::ofstream s("test"); diff --git a/src/Processors/Formats/Impl/ParquetMetadataInputFormat.cpp b/src/Processors/Formats/Impl/ParquetMetadataInputFormat.cpp index 229a0630328e..cff83b0ad3b6 100644 --- a/src/Processors/Formats/Impl/ParquetMetadataInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetMetadataInputFormat.cpp @@ -180,7 +180,7 @@ Chunk ParquetMetadataInputFormat::generate() else if (name == names[3]) { auto column = types[3]->createColumn(); - /// Version сan be only PARQUET_1_0 or PARQUET_2_LATEST (which is 2.6). + /// Version can be only PARQUET_1_0 or PARQUET_2_LATEST (which is 2.6). String version = metadata->version() == parquet::ParquetVersion::PARQUET_1_0 ? "1.0" : "2.6"; assert_cast(*column).insertData(version.data(), version.size()); res.addColumn(std::move(column)); diff --git a/utils/check-style/check-style b/utils/check-style/check-style index c28ca1cfc8a9..67c185a0b544 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -413,3 +413,6 @@ find $ROOT_PATH/tests/queries/1_stateful -name '*.sql' -or -name '*.sh' | grep - # Check for bad punctuation: whitespace before comma. find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep -P --line-number '\w ,' | grep -v 'bad punctuation is ok here' && echo "^ There is bad punctuation: whitespace before comma. You should write it like this: 'Hello, world!'" + +# Cyrillic characters hiding inside Latin. +find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep -P --line-number '[a-zA-Z][а-яА-ЯёЁ]|[а-яА-ЯёЁ][a-zA-Z]' && echo "^ Cyrillic characters found in unexpected place." From 63b155a74111e7f2b1b94808e8f9aa47374bc71f Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Sat, 29 Jul 2023 23:12:54 +0000 Subject: [PATCH 10/15] bypass merging attempt for non 'clickhouse' configurations with different root node names --- src/Common/Config/ConfigProcessor.cpp | 13 +++++++++++-- src/Common/Config/ConfigProcessor.h | 4 +++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/Common/Config/ConfigProcessor.cpp b/src/Common/Config/ConfigProcessor.cpp index a55183782d83..04f55600b409 100644 --- a/src/Common/Config/ConfigProcessor.cpp +++ b/src/Common/Config/ConfigProcessor.cpp @@ -328,7 +328,7 @@ void ConfigProcessor::mergeRecursive(XMLDocumentPtr config, Node * config_root, } } -void ConfigProcessor::merge(XMLDocumentPtr config, XMLDocumentPtr with) +bool ConfigProcessor::merge(XMLDocumentPtr config, XMLDocumentPtr with) { Node * config_root = getRootNode(config.get()); Node * with_root = getRootNode(with.get()); @@ -343,11 +343,15 @@ void ConfigProcessor::merge(XMLDocumentPtr config, XMLDocumentPtr with) && !((config_root_node_name == "yandex" || config_root_node_name == "clickhouse") && (merged_root_node_name == "yandex" || merged_root_node_name == "clickhouse"))) { + if (config_root_node_name != "clickhouse" && config_root_node_name != "yandex") + return false; + throw Poco::Exception("Root element doesn't have the corresponding root element as the config file." " It must be <" + config_root->nodeName() + ">"); } mergeRecursive(config, config_root, with_root); + return true; } void ConfigProcessor::doIncludesRecursive( @@ -645,7 +649,12 @@ XMLDocumentPtr ConfigProcessor::processConfig( with = dom_parser.parse(merge_file); } - merge(config, with); + if (!merge(config, with)) + { + LOG_DEBUG(log, "Merging bypassed - configuration file '{}' doesn't belong to configuration '{}' - merging root node name '{}' doesn't match '{}'", + merge_file, path, getRootNode(with.get())->nodeName(), getRootNode(config.get())->nodeName()); + continue; + } contributing_files.push_back(merge_file); } diff --git a/src/Common/Config/ConfigProcessor.h b/src/Common/Config/ConfigProcessor.h index 060ef49d36a3..b4f85b105265 100644 --- a/src/Common/Config/ConfigProcessor.h +++ b/src/Common/Config/ConfigProcessor.h @@ -144,7 +144,9 @@ class ConfigProcessor void mergeRecursive(XMLDocumentPtr config, Poco::XML::Node * config_root, const Poco::XML::Node * with_root); - void merge(XMLDocumentPtr config, XMLDocumentPtr with); + /// If config root node name is not 'clickhouse' and merging config's root node names doesn't match, bypasses merging and returns false. + /// For compatibility root node 'yandex' considered equal to 'clickhouse'. + bool merge(XMLDocumentPtr config, XMLDocumentPtr with); void doIncludesRecursive( XMLDocumentPtr config, From 4d27cf4386e1b3b3d0b1516723fbef0f9143ebcc Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 30 Jul 2023 02:11:32 +0200 Subject: [PATCH 11/15] Proper stripping --- cmake/split_debug_symbols.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/split_debug_symbols.cmake b/cmake/split_debug_symbols.cmake index 8ba7669d9e2f..d6821eb6c482 100644 --- a/cmake/split_debug_symbols.cmake +++ b/cmake/split_debug_symbols.cmake @@ -24,7 +24,7 @@ macro(clickhouse_split_debug_symbols) COMMAND chmod 0644 "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug" # Strips binary, sections '.note' & '.comment' are removed in line with Debian's stripping policy: www.debian.org/doc/debian-policy/ch-files.html, section '.clickhouse.hash' is needed for integrity check. # Also, after we disabled the export of symbols for dynamic linking, we still to keep a static symbol table for good stack traces. - COMMAND "${STRIP_PATH}" --remove-section=.comment --remove-section=.note --keep-section=.clickhouse.hash --keep-section=.symtab --keep-section=.strtab --keep-section=.shstrtab "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" + COMMAND "${STRIP_PATH}" --strip-debug --remove-section=.comment --remove-section=.note "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" # Associate stripped binary with debug symbols: COMMAND "${OBJCOPY_PATH}" --add-gnu-debuglink "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" COMMENT "Stripping clickhouse binary" VERBATIM From a4f2eea51603bb98340e8c6add215ca8b7fc676d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 30 Jul 2023 02:19:39 +0200 Subject: [PATCH 12/15] Maybe better --- utils/check-style/check-style | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check-style/check-style b/utils/check-style/check-style index 67c185a0b544..b728602ef408 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -12,6 +12,7 @@ # (simple regexps) to check if the code is likely to have basic style violations. # and then to run formatter only for the specified files. +LC_ALL="en_US.UTF-8" ROOT_PATH=$(git rev-parse --show-toplevel) EXCLUDE_DIRS='build/|integration/|widechar_width/|glibc-compatibility/|poco/|memcpy/|consistent-hashing|benchmark|tests/.*.cpp|utils/keeper-bench/example.yaml' From 1deebd3ae624232e2b9ae05e7b1350816ef490ee Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 30 Jul 2023 04:22:36 +0200 Subject: [PATCH 13/15] Fix Docker --- docker/test/base/Dockerfile | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docker/test/base/Dockerfile b/docker/test/base/Dockerfile index f68368044541..bfbe9461c1ea 100644 --- a/docker/test/base/Dockerfile +++ b/docker/test/base/Dockerfile @@ -11,6 +11,7 @@ RUN apt-get update \ pv \ ripgrep \ zstd \ + locales \ --yes --no-install-recommends # Sanitizer options for services (clickhouse-server) @@ -28,7 +29,13 @@ ENV TSAN_OPTIONS='halt_on_error=1 history_size=7 memory_limit_mb=46080 second_de ENV UBSAN_OPTIONS='print_stacktrace=1' ENV MSAN_OPTIONS='abort_on_error=1 poison_in_dtor=1' +RUN echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && locale-gen en_US.UTF-8 +ENV LC_ALL en_US.UTF-8 + ENV TZ=Europe/Moscow RUN ln -snf "/usr/share/zoneinfo/$TZ" /etc/localtime && echo "$TZ" > /etc/timezone CMD sleep 1 + + +RUN apt-get update && apt-get install -y locales From ecfde7edc6ad3292739ada60ec36c8f9413f9ffe Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 30 Jul 2023 05:20:51 +0200 Subject: [PATCH 14/15] Fix Docker --- docker/test/style/Dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docker/test/style/Dockerfile b/docker/test/style/Dockerfile index 2aa0b1a62d63..bd1c26855b77 100644 --- a/docker/test/style/Dockerfile +++ b/docker/test/style/Dockerfile @@ -18,9 +18,13 @@ RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \ python3-pip \ shellcheck \ yamllint \ + locales \ && pip3 install black==23.1.0 boto3 codespell==2.2.1 mypy==1.3.0 PyGithub unidiff pylint==2.6.2 \ && apt-get clean \ - && rm -rf /root/.cache/pip + && rm -rf /root/.cache/pip + +RUN echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && locale-gen en_US.UTF-8 +ENV LC_ALL en_US.UTF-8 # Architecture of the image when BuildKit/buildx is used ARG TARGETARCH From 424fe43090fce65d564997452a85e5d617271b52 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 30 Jul 2023 06:57:11 +0300 Subject: [PATCH 15/15] Update Dockerfile --- docker/test/base/Dockerfile | 3 --- 1 file changed, 3 deletions(-) diff --git a/docker/test/base/Dockerfile b/docker/test/base/Dockerfile index bfbe9461c1ea..6ba44662af96 100644 --- a/docker/test/base/Dockerfile +++ b/docker/test/base/Dockerfile @@ -36,6 +36,3 @@ ENV TZ=Europe/Moscow RUN ln -snf "/usr/share/zoneinfo/$TZ" /etc/localtime && echo "$TZ" > /etc/timezone CMD sleep 1 - - -RUN apt-get update && apt-get install -y locales