From 4cdfc7e72c5c88010de0e43952b19fd48647848e Mon Sep 17 00:00:00 2001 From: Rui Ueyama Date: Wed, 1 Nov 2023 15:36:40 +0900 Subject: [PATCH] Do not emit text relocations for IFUNC symbols in PDEs IFUNC symbols are resolved at process startup by executing the function that the symbol points to. This is used to select the "best" function at runtime; for instance, the runtime may choose a faster version of memcpy that uses SIMD instructions if they are available on the current system. Thus, an IFUNC symbol has two addresses: the initial address (or the resolver's address) and the resolved address, which is the return value of the resolver. In position-independent executables (PIEs), function pointers are loaded from the GOT indirectly, and symbols are not directly referenced. In such executables, the initial value of the GOT slot for an IFUNC symbol contains the resolver address, and this is overwritten at runtime to the resolved address upon process startup. When user code takes a pointer to an IFUNC, it always reads the resolved address from GOT. In contrast, position-dependent executables (PDEs) may have instructions that directly refer to an IFUNC symbol, such as movabs on x86-64. The GOT entry for an IFUNC holds the resolved address, so any direct reference must also produce the resolved address to maintain pointer equality. (C/C++ standards require that two pointers must be equal if and only if they are taken for the same symbol.) Previously, we emitted text relocations to modify instruction operands. However, text relocations are undesirable and not always reliable. For example, on ARM64, multiple instructions are used to materialize a symbol's address, and it's not feasible to issue a dynamic relocation to alter those instructions since the dynamic loader generally can only modify 32-bit or 64-bit words. In this commit, I have adopted a different strategy. An IFUNC symbol now occupies two consecutive GOT slots in a PDE. 
The first slot holds the symbol's PLT address, and the second slot holds the resolved address. The PLT address is consistently used as the symbol's address throughout the process, while the second slot is used only by the PLT entry to jump to the resolved address. This method ensures pointer equality without the need to emit text relocations for IFUNC symbols in PDEs. --- elf/arch-arm32.cc | 2 +- elf/arch-arm64.cc | 2 +- elf/arch-i386.cc | 4 +- elf/arch-loongarch.cc | 2 +- elf/arch-m68k.cc | 2 +- elf/arch-ppc32.cc | 2 +- elf/arch-riscv.cc | 2 +- elf/arch-s390x.cc | 2 +- elf/arch-sh4.cc | 4 +- elf/arch-sparc64.cc | 2 +- elf/arch-x86-64.cc | 2 +- elf/input-sections.cc | 11 +++-- elf/mold.h | 27 +++++++++++ elf/output-chunks.cc | 23 ++++++++- test/elf/ifunc-address-equality-exported.sh | 33 +++++++++++++ test/elf/ifunc-address-equality.sh | 52 +++++++++++++++++++++ 16 files changed, 152 insertions(+), 20 deletions(-) create mode 100755 test/elf/ifunc-address-equality-exported.sh create mode 100755 test/elf/ifunc-address-equality.sh diff --git a/elf/arch-arm32.cc b/elf/arch-arm32.cc index db2de9846f..3ad58d8e7e 100644 --- a/elf/arch-arm32.cc +++ b/elf/arch-arm32.cc @@ -225,7 +225,7 @@ void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { memcpy(buf, plt_entry, sizeof(plt_entry)); - *(ul32 *)(buf + 12) = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 12; + *(ul32 *)(buf + 12) = sym.get_got_pltgot_addr(ctx) - sym.get_plt_addr(ctx) - 12; } // ARM does not use .eh_frame for exception handling. 
Instead, it uses diff --git a/elf/arch-arm64.cc b/elf/arch-arm64.cc index c0b5ff6791..75fddba0e9 100644 --- a/elf/arch-arm64.cc +++ b/elf/arch-arm64.cc @@ -93,7 +93,7 @@ void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { 0xd503'201f, // nop }; - u64 got = sym.get_got_addr(ctx); + u64 got = sym.get_got_pltgot_addr(ctx); u64 plt = sym.get_plt_addr(ctx); memcpy(buf, insn, sizeof(insn)); diff --git a/elf/arch-i386.cc b/elf/arch-i386.cc index 95184e6037..5c534256f1 100644 --- a/elf/arch-i386.cc +++ b/elf/arch-i386.cc @@ -164,7 +164,7 @@ void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // (padding) }; memcpy(buf, insn, sizeof(insn)); - *(ul32 *)(buf + 6) = sym.get_got_addr(ctx) - ctx.got->shdr.sh_addr; + *(ul32 *)(buf + 6) = sym.get_got_pltgot_addr(ctx) - ctx.got->shdr.sh_addr; } else { static const u8 insn[] = { 0xf3, 0x0f, 0x1e, 0xfb, // endbr32 @@ -172,7 +172,7 @@ void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // (padding) }; memcpy(buf, insn, sizeof(insn)); - *(ul32 *)(buf + 6) = sym.get_got_addr(ctx); + *(ul32 *)(buf + 6) = sym.get_got_pltgot_addr(ctx); } } diff --git a/elf/arch-loongarch.cc b/elf/arch-loongarch.cc index ff8d1849e9..fd17ad3e18 100644 --- a/elf/arch-loongarch.cc +++ b/elf/arch-loongarch.cc @@ -172,7 +172,7 @@ void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { - u64 got = sym.get_got_addr(ctx); + u64 got = sym.get_got_pltgot_addr(ctx); u64 plt = sym.get_plt_addr(ctx); memcpy(buf, E::is_64 ? 
plt_entry_64 : plt_entry_32, E::plt_size); diff --git a/elf/arch-m68k.cc b/elf/arch-m68k.cc index 97ef65f58d..f9de3be07f 100644 --- a/elf/arch-m68k.cc +++ b/elf/arch-m68k.cc @@ -52,7 +52,7 @@ void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { }; memcpy(buf, insn, sizeof(insn)); - *(ub32 *)(buf + 4) = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 2; + *(ub32 *)(buf + 4) = sym.get_got_pltgot_addr(ctx) - sym.get_plt_addr(ctx) - 2; } template <> diff --git a/elf/arch-ppc32.cc b/elf/arch-ppc32.cc index d75656ad9f..380c7e6bcc 100644 --- a/elf/arch-ppc32.cc +++ b/elf/arch-ppc32.cc @@ -120,7 +120,7 @@ void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { memcpy(buf, plt_entry, sizeof(plt_entry)); ub32 *loc = (ub32 *)buf; - i64 offset = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 8; + i64 offset = sym.get_got_pltgot_addr(ctx) - sym.get_plt_addr(ctx) - 8; loc[4] |= higha(offset); loc[5] |= lo(offset); } diff --git a/elf/arch-riscv.cc b/elf/arch-riscv.cc index 644eb5ae8f..cf7b05c23a 100644 --- a/elf/arch-riscv.cc +++ b/elf/arch-riscv.cc @@ -198,7 +198,7 @@ void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { - u64 got = sym.get_got_addr(ctx); + u64 got = sym.get_got_pltgot_addr(ctx); u64 plt = sym.get_plt_addr(ctx); memcpy(buf, E::is_64 ? 
plt_entry_64 : plt_entry_32, E::plt_size); diff --git a/elf/arch-s390x.cc b/elf/arch-s390x.cc index 87ec3e8cf8..5fe7539d33 100644 --- a/elf/arch-s390x.cc +++ b/elf/arch-s390x.cc @@ -90,7 +90,7 @@ void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { }; memcpy(buf, insn, sizeof(insn)); - *(ub32 *)(buf + 2) = (sym.get_got_addr(ctx) - sym.get_plt_addr(ctx)) >> 1; + *(ub32 *)(buf + 2) = (sym.get_got_pltgot_addr(ctx) - sym.get_plt_addr(ctx)) >> 1; } template <> diff --git a/elf/arch-sh4.cc b/elf/arch-sh4.cc index 248964d87c..46d611282f 100644 --- a/elf/arch-sh4.cc +++ b/elf/arch-sh4.cc @@ -169,7 +169,7 @@ void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { static_assert(sizeof(insn) == E::pltgot_size); memcpy(buf, insn, sizeof(insn)); - *(ul32 *)(buf + 8) = sym.get_got_addr(ctx) - ctx.got->shdr.sh_addr; + *(ul32 *)(buf + 8) = sym.get_got_pltgot_addr(ctx) - ctx.got->shdr.sh_addr; } else { static const u8 insn[] = { 0x01, 0xd0, // mov.l 1f, r0 @@ -181,7 +181,7 @@ void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { static_assert(sizeof(insn) == E::pltgot_size); memcpy(buf, insn, sizeof(insn)); - *(ul32 *)(buf + 8) = sym.get_got_addr(ctx); + *(ul32 *)(buf + 8) = sym.get_got_pltgot_addr(ctx); } } diff --git a/elf/arch-sparc64.cc b/elf/arch-sparc64.cc index c6c455b350..bebbe11d4e 100644 --- a/elf/arch-sparc64.cc +++ b/elf/arch-sparc64.cc @@ -115,7 +115,7 @@ void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { }; memcpy(buf, entry, sizeof(entry)); - *(ub64 *)(buf + 24) = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 4; + *(ub64 *)(buf + 24) = sym.get_got_pltgot_addr(ctx) - sym.get_plt_addr(ctx) - 4; } template <> diff --git a/elf/arch-x86-64.cc b/elf/arch-x86-64.cc index 698bd7e662..af86adecc4 100644 --- a/elf/arch-x86-64.cc +++ b/elf/arch-x86-64.cc @@ -98,7 +98,7 @@ void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { }; memcpy(buf, insn, sizeof(insn)); - *(ul32 *)(buf + 2) = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 6; + 
*(ul32 *)(buf + 2) = sym.get_got_pltgot_addr(ctx) - sym.get_plt_addr(ctx) - 6; } template <> diff --git a/elf/input-sections.cc b/elf/input-sections.cc index 5b1e56474e..864e72bb1a 100644 --- a/elf/input-sections.cc +++ b/elf/input-sections.cc @@ -7,7 +7,8 @@ namespace mold::elf { typedef enum { - NONE, ERROR, COPYREL, DYN_COPYREL, PLT, CPLT, DYN_CPLT, DYNREL, BASEREL, IFUNC, + NONE, ERROR, COPYREL, DYN_COPYREL, PLT, CPLT, DYN_CPLT, DYNREL, + BASEREL, IFUNC_DYNREL, } Action; template @@ -203,7 +204,7 @@ static void scan_rel(Context &ctx, InputSection &isec, Symbol &sym, if (!isec.is_relr_reloc(ctx, rel)) isec.file.num_dynrel++; break; - case IFUNC: + case IFUNC_DYNREL: // Create an IRELATIVE relocation for a GNU ifunc symbol. // // We usually create an IRELATIVE relocation in .got for each ifunc. @@ -273,7 +274,7 @@ static Action get_absrel_action(Context &ctx, Symbol &sym) { template static Action get_dyn_absrel_action(Context &ctx, Symbol &sym) { if (sym.is_ifunc()) - return IFUNC; + return ctx.arg.pic ? IFUNC_DYNREL : NONE; // This is a decision table for absolute relocations for the pointer // size data (e.g. R_X86_64_64). Unlike the absrel_table, we can emit @@ -291,7 +292,7 @@ static Action get_dyn_absrel_action(Context &ctx, Symbol &sym) { template static Action get_ppc64_toc_action(Context &ctx, Symbol &sym) { if (sym.is_ifunc()) - return IFUNC; + return IFUNC_DYNREL; // As a special case, we do not create copy relocations nor canonical // PLTs for .toc sections. 
PPC64's .toc is a compiler-generated @@ -406,7 +407,7 @@ static void apply_absrel(Context &ctx, InputSection &isec, case DYNREL: emit_abs_dynrel(); break; - case IFUNC: + case IFUNC_DYNREL: if constexpr (supports_ifunc) { u64 addr = sym.get_addr(ctx, NO_PLT) + A; *dynrel++ = ElfRel(P, E::R_IRELATIVE, 0, addr); diff --git a/elf/mold.h b/elf/mold.h index 0f97f06ec2..15a1b6c944 100644 --- a/elf/mold.h +++ b/elf/mold.h @@ -1940,6 +1940,7 @@ class Symbol { u64 get_tlsdesc_addr(Context &ctx) const; u64 get_plt_addr(Context &ctx) const; u64 get_opd_addr(Context &ctx) const; + u64 get_got_pltgot_addr(Context &ctx) const; void set_got_idx(Context &ctx, i32 idx); void set_gottp_idx(Context &ctx, i32 idx); @@ -1973,6 +1974,7 @@ class Symbol { bool is_relative() const { return !is_absolute(); } bool is_local(Context &ctx) const; bool is_ifunc() const { return get_type() == STT_GNU_IFUNC; } + bool is_pde_ifunc(Context &ctx) const; bool is_remaining_undef_weak() const; bool is_pcrel_linktime_const(Context &ctx) const; @@ -2574,6 +2576,25 @@ inline u64 Symbol::get_opd_addr(Context &ctx) const { get_opd_idx(ctx) * PPC64OpdSection::ENTRY_SIZE; } +template +inline u64 Symbol::get_got_pltgot_addr(Context &ctx) const { + // An ifunc symbol occupies two consecutive GOT slots in a + // position-dependent executable (PDE). The first slot contains the + // symbol's PLT address, and the second slot holds the resolved + // address. A PDE uses the ifunc symbol's PLT entry as the address + // for the symbol, akin to a canonical PLT. + // + // This function returns the address that the PLT entry should use + // to jump to the resolved address. + // + // Note that we don't use this function for PPC64. In PPC64, symbols + // are always accessed through the TOC table regardless of the + // -fno-PIE setting. We don't need canonical PLTs on those psABIs either. 
+ if (is_pde_ifunc(ctx)) + return get_got_addr(ctx) + sizeof(Word); + return get_got_addr(ctx); +} + template inline void Symbol::set_got_idx(Context &ctx, i32 idx) { assert(aux_idx != -1); @@ -2702,6 +2723,12 @@ inline bool Symbol::is_local(Context &ctx) const { return !is_imported && !is_exported; } +template +inline bool Symbol::is_pde_ifunc(Context &ctx) const { + // Returns true if this is an ifunc that uses two GOT slots + return is_ifunc() && !ctx.arg.pic && !is_ppc64; +} + // A remaining weak undefined symbol is promoted to a dynamic symbol // in DSO and resolved to 0 in an executable. This function returns // true if it's latter. diff --git a/elf/output-chunks.cc b/elf/output-chunks.cc index 1fda2a09d2..a65f0fa7f2 100644 --- a/elf/output-chunks.cc +++ b/elf/output-chunks.cc @@ -1082,7 +1082,14 @@ void OutputSection::populate_symtab(Context &ctx) { template void GotSection::add_got_symbol(Context &ctx, Symbol *sym) { sym->set_got_idx(ctx, this->shdr.sh_size / sizeof(Word)); - this->shdr.sh_size += sizeof(Word); + + // An IFUNC symbol uses two GOT slots in a position-dependent + // executable. + if (sym->is_pde_ifunc(ctx)) + this->shdr.sh_size += sizeof(Word) * 2; + else + this->shdr.sh_size += sizeof(Word); + got_syms.push_back(sym); } @@ -1176,7 +1183,12 @@ static std::vector> get_got_entries(Context &ctx) { // IFUNC always needs to be fixed up by the dynamic linker. 
if constexpr (supports_ifunc) { if (sym->is_ifunc()) { - add({idx, sym->get_addr(ctx, NO_PLT), E::R_IRELATIVE}); + if (sym->is_pde_ifunc(ctx)) { + add({idx, sym->get_plt_addr(ctx)}); + add({idx + 1, sym->get_addr(ctx, NO_PLT), E::R_IRELATIVE}); + } else { + add({idx, sym->get_addr(ctx, NO_PLT), E::R_IRELATIVE}); + } continue; } } @@ -1656,8 +1668,15 @@ ElfSym to_output_esym(Context &ctx, Symbol &sym, u32 st_name, esym.st_shndx = SHN_ABS; esym.st_value = sym.get_addr(ctx); } else if (sym.get_type() == STT_TLS) { + // TLS symbol shndx = get_st_shndx(sym); esym.st_value = sym.get_addr(ctx) - ctx.tls_begin; + } else if (sym.is_pde_ifunc(ctx)) { + // IFUNC symbol in PDE that uses two GOT slots + shndx = get_st_shndx(sym); + esym.st_type = STT_FUNC; + esym.st_visibility = sym.visibility; + esym.st_value = sym.get_addr(ctx); } else { shndx = get_st_shndx(sym); esym.st_visibility = sym.visibility; diff --git a/test/elf/ifunc-address-equality-exported.sh b/test/elf/ifunc-address-equality-exported.sh new file mode 100755 index 0000000000..74bb76b8b2 --- /dev/null +++ b/test/elf/ifunc-address-equality-exported.sh @@ -0,0 +1,33 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +supports_ifunc || skip + +cat < + +typedef void Func(); + +__attribute__((ifunc("resolve_foo"))) void foo(void); +void real_foo(void) { printf("foo "); } +Func *resolve_foo() { return real_foo; } + +Func *get_foo(); + +int main() { + printf("%p %p\n", foo, get_foo()); + foo(); + printf("\n"); +} +EOF + +$CC -B. -o $t/exe1 $t/c.o $t/b.so -no-pie +$QEMU $t/exe1 | grep -Eq '^(\S+) \1' diff --git a/test/elf/ifunc-address-equality.sh b/test/elf/ifunc-address-equality.sh new file mode 100755 index 0000000000..2ba8fdd3cf --- /dev/null +++ b/test/elf/ifunc-address-equality.sh @@ -0,0 +1,52 @@ +#!/bin/bash +. 
$(dirname $0)/common.inc + +supports_ifunc || skip + +cat < + +typedef void Func(); + +__attribute__((ifunc("resolve_foo"))) void foo(void); +void real_foo(void) { printf("foo "); } +Func *resolve_foo() { return real_foo; } + +__attribute__((ifunc("resolve_bar"))) void bar(void); +void real_bar(void) { printf("bar "); } +Func *resolve_bar() { return real_bar; } +EOF + +cat < + +typedef void Func(); + +void foo(); +void bar(); +Func *get_foo(); +Func *get_bar(); + +int main() { + printf("%p %p %p %p\n", foo, get_foo(), bar, get_bar()); + foo(); + bar(); + printf("\n"); +} +EOF + +$CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -no-pie +$QEMU $t/exe1 | grep -Eq '^(\S+) \1 (\S+) \2' + +readelf --dynamic $t/exe1 > $t/log1 +! grep -q TEXTREL $t/log1 || false