From 57a19ac3365f1dc255e6f24fcb7afcde2ccef8f9 Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Thu, 15 Aug 2024 13:34:03 +0200
Subject: [PATCH] Reapply "[lldb] Tolerate multiple compile units with the same
 DWO ID (#100577)" (#104041)

The only change vs. the first version of the patch is that I've made
DWARFUnit linking thread-safe/unit. This was necessary because, during
the
indexing step, two skeleton units could attempt to associate themselves
with the split unit.

The original commit message was:

I ran into this when LTO completely emptied two compile units, so they
ended up with the same hash (see #100375). Although, ideally, the
compiler would try to ensure we don't end up with a hash collision even
in this case, guaranteeing their absence is practically impossible. This
patch ensures this situation does not bring down lldb.
---
 .../Plugins/SymbolFile/DWARF/DWARFUnit.cpp    |  41 ++---
 .../Plugins/SymbolFile/DWARF/DWARFUnit.h      |   4 +-
 .../SymbolFile/DWARF/x86/dwp-hash-collision.s | 142 ++++++++++++++++++
 3 files changed, 168 insertions(+), 19 deletions(-)
 create mode 100644 lldb/test/Shell/SymbolFile/DWARF/x86/dwp-hash-collision.s
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
index 66a762bf9b6854..81f937762e35a6 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
@@ -97,12 +97,14 @@ void DWARFUnit::ExtractUnitDIEIfNeeded() {
         *m_dwo_id, m_first_die.GetOffset()));
     return; // Can't fetch the compile unit from the dwo file.
   }
-  // If the skeleton compile unit gets its unit DIE parsed first, then this
-  // will fill in the DWO file's back pointer to this skeleton compile unit.
-  // If the DWO files get parsed on their own first the skeleton back link
-  // can be done manually in DWARFUnit::GetSkeletonCompileUnit() which will
-  // do a reverse lookup and cache the result.
-  dwo_cu->SetSkeletonUnit(this);
+
+  // Link the DWO unit to this object, if it hasn't been linked already (this
+  // can happen when we have an index, and the DWO unit is parsed first).
+  if (!dwo_cu->LinkToSkeletonUnit(*this)) {
+    SetDwoError(Status::createWithFormat(
+        "multiple compile units with Dwo ID {0:x16}", *m_dwo_id));
+    return;
+  }
 
   DWARFBaseDIE dwo_cu_die = dwo_cu->GetUnitDIEOnly();
   if (!dwo_cu_die.IsValid()) {
@@ -708,23 +710,28 @@ uint8_t DWARFUnit::GetAddressByteSize(const DWARFUnit *cu) {
 uint8_t DWARFUnit::GetDefaultAddressSize() { return 4; }
 
 DWARFCompileUnit *DWARFUnit::GetSkeletonUnit() {
-  if (m_skeleton_unit == nullptr && IsDWOUnit()) {
+  if (m_skeleton_unit.load() == nullptr && IsDWOUnit()) {
     SymbolFileDWARFDwo *dwo =
         llvm::dyn_cast_or_null<SymbolFileDWARFDwo>(&GetSymbolFileDWARF());
     // Do a reverse lookup if the skeleton compile unit wasn't set.
-    if (dwo)
-      m_skeleton_unit = dwo->GetBaseSymbolFile().GetSkeletonUnit(this);
+    DWARFUnit *candidate_skeleton_unit =
+        dwo ? dwo->GetBaseSymbolFile().GetSkeletonUnit(this) : nullptr;
+    if (candidate_skeleton_unit)
+      (void)LinkToSkeletonUnit(*candidate_skeleton_unit);
+    // Linking may fail due to a race, so be sure to return the actual value.
   }
-  return llvm::dyn_cast_or_null<DWARFCompileUnit>(m_skeleton_unit);
+  return llvm::dyn_cast_or_null<DWARFCompileUnit>(m_skeleton_unit.load());
 }
 
-void DWARFUnit::SetSkeletonUnit(DWARFUnit *skeleton_unit) {
-  // If someone is re-setting the skeleton compile unit backlink, make sure
-  // it is setting it to a valid value when it wasn't valid, or if the
-  // value in m_skeleton_unit was valid, it should be the same value.
-  assert(skeleton_unit);
-  assert(m_skeleton_unit == nullptr || m_skeleton_unit == skeleton_unit);
-  m_skeleton_unit = skeleton_unit;
+bool DWARFUnit::LinkToSkeletonUnit(DWARFUnit &skeleton_unit) {
+  DWARFUnit *expected_unit = nullptr;
+  if (m_skeleton_unit.compare_exchange_strong(expected_unit, &skeleton_unit))
+    return true;
+  if (expected_unit == &skeleton_unit) {
+    // Exchange failed because it already contained the right  value.
+    return true;
+  }
+  return false; // Already linked to a different unit.
 }
 
 bool DWARFUnit::Supports_DW_AT_APPLE_objc_complete_type() {
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
index 85c37971ced8e0..148932d67b908c 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
@@ -170,7 +170,7 @@ class DWARFUnit : public UserID {
   /// both cases correctly and avoids crashes.
   DWARFCompileUnit *GetSkeletonUnit();
 
-  void SetSkeletonUnit(DWARFUnit *skeleton_unit);
+  bool LinkToSkeletonUnit(DWARFUnit &skeleton_unit);
 
   bool Supports_DW_AT_APPLE_objc_complete_type();
 
@@ -308,7 +308,7 @@ class DWARFUnit : public UserID {
   const llvm::DWARFAbbreviationDeclarationSet *m_abbrevs = nullptr;
   lldb_private::CompileUnit *m_lldb_cu = nullptr;
   // If this is a DWO file, we have a backlink to our skeleton compile unit.
-  DWARFUnit *m_skeleton_unit = nullptr;
+  std::atomic<DWARFUnit *> m_skeleton_unit = nullptr;
   // The compile unit debug information entry item
   DWARFDebugInfoEntry::collection m_die_array;
   mutable llvm::sys::RWMutex m_die_array_mutex;
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-hash-collision.s b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-hash-collision.s
new file mode 100644
index 00000000000000..d626b4602ad58f
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-hash-collision.s
@@ -0,0 +1,142 @@
+## Test that lldb handles (mainly, that it doesn't crash) the situation where
+## two skeleton compile units have the same DWO ID (and try to claim the same
+## split unit from the DWP file. This can sometimes happen when the compile unit
+## is nearly empty (e.g. because LTO has optimized all of it away).
+
+# RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj %s --defsym MAIN=0 > %t
+# RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj %s > %t.dwp
+# RUN: %lldb %t -o "image lookup -t my_enum_type" \
+# RUN:   -o "image dump separate-debug-info" -o exit | FileCheck %s
+
+## Check that we're able to access the type within the split unit (no matter
+## which skeleton unit it ends up associated with). Completely ignoring the unit
+## might also be reasonable.
+# CHECK: image lookup -t my_enum_type
+# CHECK: 1 match found
+# CHECK:      name = "my_enum_type", byte-size = 4, compiler_type = "enum my_enum_type {
+# CHECK-NEXT: }"
+#
+## Check that we get some indication of the error.
+# CHECK: image dump separate-debug-info
+# CHECK:      Dwo ID             Err Dwo Path
+# CHECK: 0xdeadbeefbaadf00d E   multiple compile units with Dwo ID 0xdeadbeefbaadf00d
+
+.set DWO_ID, 0xdeadbeefbaadf00d
+
+## The main file.
+.ifdef MAIN
+        .section        .debug_abbrev,"",@progbits
+        .byte   1                       # Abbreviation Code
+        .byte   74                      # DW_TAG_compile_unit
+        .byte   0                       # DW_CHILDREN_no
+        .byte   0x76                    # DW_AT_dwo_name
+        .byte   8                       # DW_FORM_string
+        .byte   0                       # EOM(1)
+        .byte   0                       # EOM(2)
+        .byte   0                       # EOM(3)
+
+
+        .section        .debug_info,"",@progbits
+.irpc I,01
+.Lcu_begin\I:
+        .long   .Ldebug_info_end\I-.Ldebug_info_start\I # Length of Unit
+.Ldebug_info_start\I:
+        .short  5                       # DWARF version number
+        .byte   4                       # DWARF Unit Type
+        .byte   8                       # Address Size (in bytes)
+        .long   .debug_abbrev           # Offset Into Abbrev. Section
+        .quad   DWO_ID                  # DWO id
+        .byte   1                       # Abbrev [1] DW_TAG_compile_unit
+        .ascii  "foo"
+        .byte   '0' + \I
+        .asciz  ".dwo\0"                # DW_AT_dwo_name
+.Ldebug_info_end\I:
+.endr
+
+.else
+## DWP file starts here.
+
+        .section        .debug_abbrev.dwo,"e",@progbits
+.LAbbrevBegin:
+        .byte   1                       # Abbreviation Code
+        .byte   17                      # DW_TAG_compile_unit
+        .byte   1                       # DW_CHILDREN_yes
+        .byte   37                      # DW_AT_producer
+        .byte   8                       # DW_FORM_string
+        .byte   19                      # DW_AT_language
+        .byte   5                       # DW_FORM_data2
+        .byte   0                       # EOM(1)
+        .byte   0                       # EOM(2)
+        .byte   2                       # Abbreviation Code
+        .byte   4                       # DW_TAG_enumeration_type
+        .byte   0                       # DW_CHILDREN_no
+        .byte   3                       # DW_AT_name
+        .byte   8                       # DW_FORM_string
+        .byte   73                      # DW_AT_type
+        .byte   19                      # DW_FORM_ref4
+        .byte   11                      # DW_AT_byte_size
+        .byte   11                      # DW_FORM_data1
+        .byte   0                       # EOM(1)
+        .byte   0                       # EOM(2)
+        .byte   4                       # Abbreviation Code
+        .byte   36                      # DW_TAG_base_type
+        .byte   0                       # DW_CHILDREN_no
+        .byte   3                       # DW_AT_name
+        .byte   8                       # DW_FORM_string
+        .byte   62                      # DW_AT_encoding
+        .byte   11                      # DW_FORM_data1
+        .byte   11                      # DW_AT_byte_size
+        .byte   11                      # DW_FORM_data1
+        .byte   0                       # EOM(1)
+        .byte   0                       # EOM(2)
+        .byte   0                       # EOM(3)
+.LAbbrevEnd:
+        .section        .debug_info.dwo,"e",@progbits
+.LCUBegin:
+.Lcu_begin1:
+        .long   .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit
+.Ldebug_info_start1:
+        .short  5                       # DWARF version number
+        .byte   5                       # DWARF Unit Type
+        .byte   8                       # Address Size (in bytes)
+        .long   0                       # Offset Into Abbrev. Section
+        .quad   DWO_ID                  # DWO id
+        .byte   1                       # Abbrev [1] DW_TAG_compile_unit
+        .asciz  "Hand-written DWARF"    # DW_AT_producer
+        .short  12                      # DW_AT_language
+        .byte   2                       # Abbrev [2] DW_TAG_enumeration_type
+        .asciz  "my_enum_type"          # DW_AT_name
+        .long   .Lint-.Lcu_begin1       # DW_AT_type
+        .byte   4                       # DW_AT_byte_size
+.Lint:
+        .byte   4                       # Abbrev [4] DW_TAG_base_type
+        .asciz  "int"                   # DW_AT_name
+        .byte   5                       # DW_AT_encoding
+        .byte   4                       # DW_AT_byte_size
+        .byte   0                       # End Of Children Mark
+.Ldebug_info_end1:
+.LCUEnd:
+        .section .debug_cu_index, "", @progbits
+## Header:
+        .short 5                        # Version
+        .short 0                        # Padding
+        .long 2                         # Section count
+        .long 1                         # Unit count
+        .long 2                         # Slot count
+## Hash Table of Signatures:
+        .quad 0
+        .quad DWO_ID
+## Parallel Table of Indexes:
+        .long 0
+        .long 1
+## Table of Section Offsets:
+## Row 0:
+        .long 1                         # DW_SECT_INFO
+        .long 3                         # DW_SECT_ABBREV
+## Row 1:
+        .long 0                         # Offset in .debug_info.dwo
+        .long 0                         # Offset in .debug_abbrev.dwo
+## Table of Section Sizes:
+        .long .LCUEnd-.LCUBegin         # Size in .debug_info.dwo
+        .long .LAbbrevEnd-.LAbbrevBegin # Size in .debug_abbrev.dwo
+.endif