From 8912b217556bdb236339393cbd86722b67801b61 Mon Sep 17 00:00:00 2001 From: David Wrighton Date: Thu, 8 Aug 2024 15:26:12 -0700 Subject: [PATCH] [release/8.0-staging] Enable TLS on linux/arm64 only for static resolver (#106101) * disable TLS for dynamic resolver * add check to skip nop for older resolver * Add test and config variables * Apply suggestions from code review Co-authored-by: Jan Kotas * Set all the tls values in the test explicitly * Fix build on OSX arm64 --------- Co-authored-by: Kunal Pathak Co-authored-by: Jan Kotas --- src/coreclr/hosts/corerun/corerun.cpp | 17 +++ src/coreclr/hosts/corerun/corerun.hpp | 23 ++++ src/coreclr/inc/clrconfigvalues.h | 2 + src/coreclr/vm/arm64/asmhelpers.S | 14 ++ src/coreclr/vm/eeconfig.cpp | 3 + src/coreclr/vm/eeconfig.h | 2 + src/coreclr/vm/jitinterface.cpp | 43 +++++- src/tests/JIT/Directed/tls/CMakeLists.txt | 8 ++ .../JIT/Directed/tls/TestTLSWithLoadedDlls.cs | 130 ++++++++++++++++++ .../Directed/tls/TestTLSWithLoadedDlls.csproj | 16 +++ src/tests/JIT/Directed/tls/testtls.cpp | 46 +++++++ 11 files changed, 303 insertions(+), 1 deletion(-) create mode 100644 src/tests/JIT/Directed/tls/CMakeLists.txt create mode 100644 src/tests/JIT/Directed/tls/TestTLSWithLoadedDlls.cs create mode 100644 src/tests/JIT/Directed/tls/TestTLSWithLoadedDlls.csproj create mode 100644 src/tests/JIT/Directed/tls/testtls.cpp diff --git a/src/coreclr/hosts/corerun/corerun.cpp b/src/coreclr/hosts/corerun/corerun.cpp index 4e506b95c9746..858f72297f6bd 100644 --- a/src/coreclr/hosts/corerun/corerun.cpp +++ b/src/coreclr/hosts/corerun/corerun.cpp @@ -472,6 +472,7 @@ static void display_usage() W(" -p, --property - Property to pass to runtime during initialization.\n") W(" If a property value contains spaces, quote the entire argument.\n") W(" May be supplied multiple times. 
Format: <key>=<value>.\n") + W(" -l, --preload - path to shared library to load before loading the CLR.\n") W(" -d, --debug - causes corerun to wait for a debugger to attach before executing.\n") W(" -e, --env - path to a .env file with environment variables that corerun should set.\n") W(" -?, -h, --help - show this help.\n") @@ -569,6 +570,22 @@ static bool parse_args( config.user_defined_keys.push_back(std::move(key)); config.user_defined_values.push_back(std::move(value)); } + else if (pal::strcmp(option, W("l")) == 0 || (pal::strcmp(option, W("preload")) == 0)) + { + i++; + if (i >= argc) + { + pal::fprintf(stderr, W("Option %s: missing shared library path\n"), arg); + break; + } + + string_t library = argv[i]; + pal::mod_t hMod; + if (!pal::try_load_library(library, hMod)) + { + break; + } + } else if (pal::strcmp(option, W("d")) == 0 || (pal::strcmp(option, W("debug")) == 0)) { config.wait_to_debug = true; diff --git a/src/coreclr/hosts/corerun/corerun.hpp b/src/coreclr/hosts/corerun/corerun.hpp index f0e51a668fe22..567d7b9dc5062 100644 --- a/src/coreclr/hosts/corerun/corerun.hpp +++ b/src/coreclr/hosts/corerun/corerun.hpp @@ -224,6 +224,17 @@ namespace pal return hMod != nullptr; } + inline bool try_load_library(const pal::string_t& path, pal::mod_t& hMod) + { + hMod = (pal::mod_t)::LoadLibraryExW(path.c_str(), nullptr, 0); + if (hMod == nullptr) + { + pal::fprintf(stderr, W("Failed to load: '%s'. Error: 0x%08x\n"), path.c_str(), ::GetLastError()); + return false; + } + return true; + } + inline bool try_load_coreclr(const pal::string_t& core_root, pal::mod_t& hMod) { pal::string_t coreclr_path = core_root; @@ -602,6 +613,18 @@ namespace pal return hMod != nullptr; } + inline bool try_load_library(const pal::string_t& path, pal::mod_t& hMod) + { + hMod = (pal::mod_t)dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (hMod == nullptr) + { + pal::fprintf(stderr, W("Failed to load: '%s'. 
Error: %s\n"), path.c_str(), dlerror()); + return false; + } + return true; + } + + inline bool try_load_coreclr(const pal::string_t& core_root, pal::mod_t& hMod) { pal::string_t coreclr_path = core_root; diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h index 7b356b0744399..13962a2060905 100644 --- a/src/coreclr/inc/clrconfigvalues.h +++ b/src/coreclr/inc/clrconfigvalues.h @@ -342,6 +342,8 @@ CONFIG_STRING_INFO(INTERNAL_TailCallMax, W("TailCallMax"), "") RETAIL_CONFIG_STRING_INFO(EXTERNAL_TailCallOpt, W("TailCallOpt"), "") RETAIL_CONFIG_DWORD_INFO(EXTERNAL_TailCallLoopOpt, W("TailCallLoopOpt"), 1, "Convert recursive tail calls to loops") RETAIL_CONFIG_DWORD_INFO(EXTERNAL_Jit_NetFx40PInvokeStackResilience, W("NetFx40_PInvokeStackResilience"), (DWORD)-1, "Makes P/Invoke resilient against mismatched signature and calling convention (significant perf penalty).") +RETAIL_CONFIG_DWORD_INFO(EXTERNAL_DisableOptimizedThreadStaticAccess, W("DisableOptimizedThreadStaticAccess"), (DWORD)0, "Disable the OptimizedThreadStaticAccess feature.") +CONFIG_DWORD_INFO(EXTERNAL_AssertNotStaticTlsResolver, W("AssertNotStaticTlsResolver"), (DWORD)0, "Assert if we attempt to use the static tls resolver path.") // AltJitAssertOnNYI should be 0 on targets where JIT is under development or bring up stage, so as to facilitate fallback to main JIT on hitting a NYI. #if defined(TARGET_X86) diff --git a/src/coreclr/vm/arm64/asmhelpers.S b/src/coreclr/vm/arm64/asmhelpers.S index 89dab80461c35..f766b9691278e 100644 --- a/src/coreclr/vm/arm64/asmhelpers.S +++ b/src/coreclr/vm/arm64/asmhelpers.S @@ -1035,4 +1035,18 @@ LEAF_ENTRY GetThreadStaticsVariableOffset, _TEXT EPILOG_RETURN LEAF_END GetThreadStaticsVariableOffset, _TEXT // ------------------------------------------------------------------ + +// ------------------------------------------------------------------ +// size_t GetTLSResolverAddress() + +// Helper to get the TLS resolver address. 
This will be then used to determine if we have a static or dynamic resolver. +LEAF_ENTRY GetTLSResolverAddress, _TEXT + PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -32 + adrp x0, :tlsdesc:t_ThreadStatics + ldr x1, [x0, #:tlsdesc_lo12:t_ThreadStatics] + mov x0, x1 + EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 32 + EPILOG_RETURN +LEAF_END GetTLSResolverAddress, _TEXT +// ------------------------------------------------------------------ #endif // !TARGET_OSX diff --git a/src/coreclr/vm/eeconfig.cpp b/src/coreclr/vm/eeconfig.cpp index 0f1de4d4fe788..37a89224617ce 100644 --- a/src/coreclr/vm/eeconfig.cpp +++ b/src/coreclr/vm/eeconfig.cpp @@ -112,6 +112,7 @@ HRESULT EEConfig::Init() fJitFramed = false; fJitMinOpts = false; fJitEnableOptionalRelocs = false; + fDisableOptimizedThreadStaticAccess = false; fPInvokeRestoreEsp = (DWORD)-1; fNgenBindOptimizeNonGac = false; @@ -548,6 +549,8 @@ HRESULT EEConfig::sync() iJitOptimizeType = CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_JitOptimizeType); if (iJitOptimizeType > OPT_RANDOM) iJitOptimizeType = OPT_DEFAULT; + fDisableOptimizedThreadStaticAccess = CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_DisableOptimizedThreadStaticAccess) != 0; + #ifdef TARGET_X86 fPInvokeRestoreEsp = CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_Jit_NetFx40PInvokeStackResilience); #endif diff --git a/src/coreclr/vm/eeconfig.h b/src/coreclr/vm/eeconfig.h index 6394f1b998c0f..3614e54320295 100644 --- a/src/coreclr/vm/eeconfig.h +++ b/src/coreclr/vm/eeconfig.h @@ -77,6 +77,7 @@ class EEConfig bool JitFramed(void) const {LIMITED_METHOD_CONTRACT; return fJitFramed; } bool JitMinOpts(void) const {LIMITED_METHOD_CONTRACT; return fJitMinOpts; } bool JitEnableOptionalRelocs(void) const {LIMITED_METHOD_CONTRACT; return fJitEnableOptionalRelocs; } + bool DisableOptimizedThreadStaticAccess(void) const {LIMITED_METHOD_CONTRACT; return fDisableOptimizedThreadStaticAccess; } // Tiered Compilation config #if defined(FEATURE_TIERED_COMPILATION) @@ -480,6 +481,7 @@ class 
EEConfig bool fJitFramed; // Enable/Disable EBP based frames bool fJitMinOpts; // Enable MinOpts for all jitted methods bool fJitEnableOptionalRelocs; // Allow optional relocs + bool fDisableOptimizedThreadStaticAccess; // Disable OptimizedThreadStatic access unsigned iJitOptimizeType; // 0=Blended,1=SmallCode,2=FastCode, default is 0=Blended diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp index 774dffe355909..01ab9aae94463 100644 --- a/src/coreclr/vm/jitinterface.cpp +++ b/src/coreclr/vm/jitinterface.cpp @@ -1463,6 +1463,10 @@ void CEEInfo::getThreadLocalStaticBlocksInfo (CORINFO_THREAD_STATIC_BLOCKS_INFO* EE_TO_JIT_TRANSITION_LEAF(); } +#if !defined(TARGET_OSX) && defined(TARGET_UNIX) && defined(TARGET_ARM64) +extern "C" size_t GetTLSResolverAddress(); +#endif // !TARGET_OSX && TARGET_UNIX && TARGET_ARM64 + /*********************************************************************/ void CEEInfo::getFieldInfo (CORINFO_RESOLVED_TOKEN * pResolvedToken, CORINFO_METHOD_HANDLE callerHandle, @@ -1567,14 +1571,46 @@ void CEEInfo::getFieldInfo (CORINFO_RESOLVED_TOKEN * pResolvedToken, fieldAccessor = CORINFO_FIELD_STATIC_SHARED_STATIC_HELPER; pResult->helper = getSharedStaticsHelper(pField, pFieldMT); + + bool optimizeThreadStaticAccess = false; #if defined(TARGET_ARM) // Optimization is disabled for linux/windows arm #elif !defined(TARGET_WINDOWS) && defined(TARGET_X86) // Optimization is disabled for linux/x86 #elif defined(TARGET_LINUX_MUSL) && defined(TARGET_ARM64) // Optimization is disabled for linux musl arm64 +#elif !defined(TARGET_OSX) && defined(TARGET_UNIX) && defined(TARGET_ARM64) + // Optimization is enabled for linux/arm64 only for static resolver. + // For static resolver, the TP offset is same for all threads. + // For dynamic resolver, TP offset returned is for the current thread and + // will be different for the other threads. 
+ uint32_t* resolverAddress = reinterpret_cast<uint32_t*>(GetTLSResolverAddress()); + int ip = 0; + if ((resolverAddress[ip] == 0xd503201f) || (resolverAddress[ip] == 0xd503241f)) + { + // nop might not be present in older resolver, so skip it. + + // nop or hint 32 + ip++; + } + + if ( + // ldr x0, [x0, #8] + (resolverAddress[ip] == 0xf9400400) && + // ret + (resolverAddress[ip + 1] == 0xd65f03c0) + ) + { + optimizeThreadStaticAccess = true; +#ifdef _DEBUG + if (CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_AssertNotStaticTlsResolver) != 0) + { + _ASSERTE(!"Detected static resolver in use when not expected"); + } +#endif + } #else - bool optimizeThreadStaticAccess = true; + optimizeThreadStaticAccess = true; #if !defined(TARGET_OSX) && defined(TARGET_UNIX) && defined(TARGET_AMD64) // For linux/x64, check if compiled coreclr as .so file and not single file. // For single file, the `tls_index` might not be accurate. @@ -1582,6 +1618,11 @@ void CEEInfo::getFieldInfo (CORINFO_RESOLVED_TOKEN * pResolvedToken, optimizeThreadStaticAccess = GetTlsIndexObjectAddress() != nullptr; #endif // !TARGET_OSX && TARGET_UNIX && TARGET_AMD64 + if (g_pConfig->DisableOptimizedThreadStaticAccess()) + { + optimizeThreadStaticAccess = false; + } + if (optimizeThreadStaticAccess) { // For windows x64/x86/arm64, linux x64/arm64/loongarch64/riscv64: diff --git a/src/tests/JIT/Directed/tls/CMakeLists.txt b/src/tests/JIT/Directed/tls/CMakeLists.txt new file mode 100644 index 0000000000000..3eaf0d3b22a0c --- /dev/null +++ b/src/tests/JIT/Directed/tls/CMakeLists.txt @@ -0,0 +1,8 @@ +# Licensed to the .NET Foundation under one or more agreements. +# The .NET Foundation licenses this file to you under the MIT license. 
+ +include_directories(${INC_PLATFORM_DIR}) + +add_library(usetls SHARED testtls.cpp) + +install (TARGETS usetls DESTINATION bin) diff --git a/src/tests/JIT/Directed/tls/TestTLSWithLoadedDlls.cs b/src/tests/JIT/Directed/tls/TestTLSWithLoadedDlls.cs new file mode 100644 index 0000000000000..18385bb14cd61 --- /dev/null +++ b/src/tests/JIT/Directed/tls/TestTLSWithLoadedDlls.cs @@ -0,0 +1,130 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + + +// This test is verifying that the runtime properly handles the cases where the TLS infra in the runtime is forced +// to use a dynamic resolver. This is done by means of a private config variable to validate the behavior on Linux Arm64 +// and a set of multithreaded tasks, that has been known to cause the runtime to crash when this is handled incorrectly. + +using System; +using System.Diagnostics; +using System.IO; +using System.Linq; +using System.Reflection; +using System.Runtime.InteropServices; +using System.Runtime.Loader; +using System.Text; +using System.Threading; +using System.Threading.Tasks; + +namespace TestTLSWithLoadedDlls +{ + static class TLSWithLoadedDlls + { + private const int CountOfLibTlsToLoad = 40; + + static async Task DoLotsOfAsyncWork(int loopCount) + { + for (int i = 0; i < loopCount; i++) + { + Console.WriteLine("Starting a new batch of tasks..."); + var tasks = Enumerable.Range(1, 100).Select(i => Task.Run(async () => + { + await Task.Delay(1); + })).ToArray(); + + await Task.WhenAll(tasks); + + Console.WriteLine("Batch of tasks completed. 
Main loop sleeping for 20 ms..."); + await Task.Delay(20); + } + } + + static int Main(string[] args) + { + if ((args.Length == 1) && (args[0] == "RunLotsOfTasks")) + { + DoLotsOfAsyncWork(100).GetAwaiter().GetResult(); + return 100; + } + + int CountOfLibTlsToLoad = 60; + + if (OperatingSystem.IsWindows()) // Windows does not have a really long command line length limit, and doesn't have a problem with many TLS using images used + CountOfLibTlsToLoad = 10; + + StringBuilder arguments = new(); + + (string prefix, string suffix) = GetSharedLibraryPrefixSuffix(); + + string UseTlsFileName = GetSharedLibraryFileNameForCurrentPlatform("usetls"); + string testDirectory = Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location); + string UseTlsFilePath = Path.Combine(testDirectory, UseTlsFileName); + + for (int i = 0; i < CountOfLibTlsToLoad; i++) + { + string tlsNumberSpecificPath = Path.Combine(testDirectory, i.ToString()); + string finalUseTlsPath = Path.Combine(tlsNumberSpecificPath, prefix + "usetls" + suffix); + + Directory.CreateDirectory(tlsNumberSpecificPath); + if (!File.Exists(finalUseTlsPath)) + { + File.Copy( + UseTlsFilePath, + finalUseTlsPath); + } + + arguments.Append(" -l "); + arguments.Append(finalUseTlsPath); + } + + arguments.Append(' '); + arguments.Append(System.Reflection.Assembly.GetExecutingAssembly().Location); + arguments.Append(" RunLotsOfTasks"); + + Process process = new Process(); + process.StartInfo.FileName = GetCorerunPath(); + process.StartInfo.Arguments = arguments.ToString(); + process.StartInfo.UseShellExecute = false; + process.StartInfo.EnvironmentVariables["DOTNET_AssertNotStaticTlsResolver"] = "1"; + + Console.WriteLine($"Launching {process.StartInfo.FileName} {process.StartInfo.Arguments}"); + + process.Start(); + process.WaitForExit(); + return process.ExitCode; + } + + private static string GetCorerunPath() + { + string corerunName; + if (OperatingSystem.IsWindows()) + { + corerunName = "CoreRun.exe"; 
+ } + else + { + corerunName = "corerun"; + } + + return Path.Combine(Environment.GetEnvironmentVariable("CORE_ROOT"), corerunName); + } + + public static (string, string) GetSharedLibraryPrefixSuffix() + { + if (OperatingSystem.IsWindows()) + return (string.Empty, ".dll"); + + if (OperatingSystem.IsMacOS()) + return ("lib", ".dylib"); + + return ("lib", ".so"); + } + + public static string GetSharedLibraryFileNameForCurrentPlatform(string libraryName) + { + (string prefix, string suffix) = GetSharedLibraryPrefixSuffix(); + return prefix + libraryName + suffix; + } + } +} diff --git a/src/tests/JIT/Directed/tls/TestTLSWithLoadedDlls.csproj b/src/tests/JIT/Directed/tls/TestTLSWithLoadedDlls.csproj new file mode 100644 index 0000000000000..c17a210af0c9d --- /dev/null +++ b/src/tests/JIT/Directed/tls/TestTLSWithLoadedDlls.csproj @@ -0,0 +1,16 @@ + + + 0 + true + false + true + + + PdbOnly + True + + + + + + diff --git a/src/tests/JIT/Directed/tls/testtls.cpp b/src/tests/JIT/Directed/tls/testtls.cpp new file mode 100644 index 0000000000000..24dfbe5d0af2c --- /dev/null +++ b/src/tests/JIT/Directed/tls/testtls.cpp @@ -0,0 +1,46 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +#ifdef _MSC_VER +#define DLLEXPORT __declspec(dllexport) +#else +#define DLLEXPORT __attribute__((visibility("default"))) +#endif // _MSC_VER + +thread_local int tls0; +thread_local int tls1; +thread_local int tls2; +thread_local int tls3; +thread_local int tls4; +thread_local int tls5; +thread_local int tls6; +thread_local int tls7; +thread_local int tls8; +thread_local int tls9; +thread_local int tls10; +thread_local int tls11; +thread_local int tls12; +thread_local int tls13; +thread_local int tls14; +thread_local int tls15; +thread_local int tls16; + +extern "C" DLLEXPORT void initializeTLS() { + tls0=0; + tls1=0; + tls2=0; + tls3=0; + tls4=0; + tls5=0; + tls6=0; + tls7=0; + tls8=0; + tls9=0; + tls10=0; + tls11=0; + tls12=0; + tls13=0; + tls14=0; + tls15=0; + tls16=0; +}