merge main into amd-staging

Change-Id: Ia7274ea51389854c15b80a31ca2facd516381ed1
ROCm · Aug 31, 2024 · 54334d7 · 54334d7
2 parents fd2e455 + ef50970
commit 54334d7
Show file tree

Hide file tree

Showing 73 changed files with 4,388 additions and 2,488 deletions.
diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml
@@ -13,6 +13,9 @@ jobs:
  code_formatter:
  runs-on: ubuntu-latest
  timeout-minutes: 30
+ concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
+ cancel-in-progress: true
  if: github.repository == 'llvm/llvm-project'
  steps:
  - name: Fetch LLVM sources

diff --git a/.github/workflows/release-binaries-save-stage/action.yml b/.github/workflows/release-binaries-save-stage/action.yml
@@ -10,6 +10,9 @@ inputs:
  required: true
  type: 'string'
 
+permissions:
+ contents: read
+
 runs:
  using: "composite"
  steps:
@@ -18,6 +21,9 @@ runs:
  - name: Package Build and Source Directories
  shell: bash
  run: |
+ # Remove .git/config to avoid leaking GITHUB_TOKEN stored there.
+ # See https://unit42.paloaltonetworks.com/github-repo-artifacts-leak-tokens/
+ rm -Rf .git/config
  # Windows does not support symlinks, so we need to dereference them.
  tar --exclude build/ ${{ (runner.os == 'Windows' && '-h') || '' }} -c . | zstd -T0 -c > ../llvm-project.tar.zst
  mv ../llvm-project.tar.zst .

diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake
@@ -382,7 +382,7 @@ foreach(target riscv32-unknown-elf)
  foreach(lang C;CXX;ASM)
  # TODO: The preprocessor defines workaround various issues in libc and libc++ integration.
  # These should be addressed and removed over time.
- set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imafc -mabi=ilp32f \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dtimeval=struct timeval{int tv_sec; int tv_usec;}\" \"-Dgettimeofday(tv, tz)\" -D_LIBCPP_PRINT=1" CACHE STRING "")
+ set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imafc -mabi=ilp32f -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dtimeval=struct timeval{int tv_sec; int tv_usec;}\" \"-Dgettimeofday(tv, tz)\" -D_LIBCPP_PRINT=1" CACHE STRING "")
  endforeach()
  foreach(type SHARED;MODULE;EXE)
  set(RUNTIMES_${target}_CMAKE_${type}_LINKER_FLAGS "-fuse-ld=lld" CACHE STRING "")

diff --git a/clang/docs/HLSL/ExpectedDifferences.rst b/clang/docs/HLSL/ExpectedDifferences.rst
@@ -54,6 +54,19 @@ HLSL 202x based on proposal
 and
 `0008 <https://github.com/microsoft/hlsl-specs/blob/main/proposals/0008-non-member-operator-overloading.md>`_.
 
+The largest difference between Clang and DXC's overload resolution is the
+algorithm used for identifying best-match overloads. There are more details
+about the algorithmic differences in the :ref:`multi_argument_overloads` section
+below. There are three high level differences that should be highlighted:
+
+* **There should be no cases** where DXC and Clang both successfully
+ resolve an overload where the resolved overload is different between the two.
+* There are cases where Clang will successfully resolve an overload that DXC
+ wouldn't because we've trimmed the overload set in Clang to remove ambiguity.
+* There are cases where DXC will successfully resolve an overload that Clang
+ will not for two reasons: (1) DXC only generates partial overload sets for
+ builtin functions and (2) DXC resolves cases that probably should be ambiguous.
+
 Clang's implementation extends standard overload resolution rules to HLSL
 library functionality. This causes subtle changes in overload resolution
 behavior between Clang and DXC. Some examples include:
@@ -71,18 +84,23 @@ behavior between Clang and DXC. Some examples include:
  uint U;
  int I;
  float X, Y, Z;
- double3 A, B;
+ double3 R, G;
  }
 
- void twoParams(int, int);
- void twoParams(float, float);
+ void takesSingleDouble(double);
+ void takesSingleDouble(vector<double, 1>);
+
+ void scalarOrVector(double);
+ void scalarOrVector(vector<double, 2>);
 
  export void call() {
- halfOrInt16(U); // DXC: Fails with call ambiguous between int16_t and uint16_t overloads
- // Clang: Resolves to halfOrInt16(uint16_t).
- halfOrInt16(I); // All: Resolves to halfOrInt16(int16_t).
  half H;
+ halfOrInt16(I); // All: Resolves to halfOrInt16(int16_t).
+
  #ifndef IGNORE_ERRORS
+ halfOrInt16(U); // All: Fails with call ambiguous between int16_t and uint16_t
+ // overloads
+
  // asfloat16 is a builtin with overloads for half, int16_t, and uint16_t.
  H = asfloat16(I); // DXC: Fails to resolve overload for int.
  // Clang: Resolves to asfloat16(int16_t).
@@ -94,21 +112,28 @@ behavior between Clang and DXC. Some examples include:
 
  takesDoubles(X, Y, Z); // Works on all compilers
  #ifndef IGNORE_ERRORS
- fma(X, Y, Z); // DXC: Fails to resolve no known conversion from float to double.
+ fma(X, Y, Z); // DXC: Fails to resolve no known conversion from float to
+ // double.
  // Clang: Resolves to fma(double,double,double).
- #endif
 
- double D = dot(A, B); // DXC: Resolves to dot(double3, double3), fails DXIL Validation.
+ double D = dot(R, G); // DXC: Resolves to dot(double3, double3), fails DXIL Validation.
  // FXC: Expands to compute double dot product with fmul/fadd
- // Clang: Resolves to dot(float3, float3), emits conversion warnings.
+ // Clang: Fails to resolve as ambiguous against
+ // dot(half, half) or dot(float, float)
+ #endif
 
  #ifndef IGNORE_ERRORS
  tan(B); // DXC: resolves to tan(float).
  // Clang: Fails to resolve, ambiguous between integer types.
 
- twoParams(I, X); // DXC: resolves twoParams(int, int).
- // Clang: Fails to resolve ambiguous conversions.
  #endif
+
+ double D;
+ takesSingleDouble(D); // All: Fails to resolve ambiguous conversions.
+ takesSingleDouble(R); // All: Fails to resolve ambiguous conversions.
+
+ scalarOrVector(D); // All: Resolves to scalarOrVector(double).
+ scalarOrVector(R); // All: Fails to resolve ambiguous conversions.
  }
 
 .. note::
@@ -119,3 +144,75 @@ behavior between Clang and DXC. Some examples include:
  diagnostic notifying the user of the conversion rather than silently altering
  precision relative to the other overloads (as FXC does) or generating code
  that will fail validation (as DXC does).
+
+.. _multi_argument_overloads:
+
+Multi-Argument Overloads
+------------------------
+
+In addition to the differences in single-element conversions, Clang and DXC
+differ dramatically in multi-argument overload resolution. C++ multi-argument
+overload resolution behavior (or something very similar) is required to
+implement
+`non-member operator overloading <https://github.com/microsoft/hlsl-specs/blob/main/proposals/0008-non-member-operator-overloading.md>`_.
+
+Clang adopts the C++ inspired language from the
+`draft HLSL specification <https://microsoft.github.io/hlsl-specs/specs/hlsl.pdf>`_,
+where an overload ``f1`` is a better candidate than ``f2`` if, for every argument,
+the conversion sequence for ``f1`` is not worse than the corresponding conversion
+sequence for ``f2``, and for at least one argument it is better.
+
+.. code-block:: c++
+
+ cbuffer CB {
+ int I;
+ float X;
+ float4 V;
+ }
+
+ void twoParams(int, int);
+ void twoParams(float, float);
+ void threeParams(float, float, float);
+ void threeParams(float4, float4, float4);
+
+ export void call() {
+ twoParams(I, X); // DXC: resolves twoParams(int, int).
+ // Clang: Fails to resolve ambiguous conversions.
+
+ threeParams(X, V, V); // DXC: resolves threeParams(float4, float4, float4).
+ // Clang: Fails to resolve ambiguous conversions.
+ }
+
+In the examples above, ``twoParams`` called with mixed parameters produces
+implicit conversion sequences that are { ExactMatch, FloatingIntegral } and {
+FloatingIntegral, ExactMatch }. In both cases an argument has a worse conversion
+in the other sequence, so the overload is ambiguous.
+
+In the ``threeParams`` example the sequences are { ExactMatch, VectorTruncation,
+VectorTruncation } or { VectorSplat, ExactMatch, ExactMatch }, again in both
+cases at least one parameter has a worse conversion in the other sequence, so
+the overload is ambiguous.
+
+.. note::
+
+ DXC's behavior described below is not officially documented; it is gleaned from
+ observation and a bit of reading the source.
+
+DXC's approach for determining the best overload produces an integer score value
+for each implicit conversion sequence for each argument expression. Scores for
+casts are based on a bitmask construction that is complicated to reverse
+engineer. It seems that:
+
+* Exact match is 0
+* Dimension increase is 1
+* Promotion is 2
+* Integral -> Float conversion is 4
+* Float -> Integral conversion is 8
+* Cast is 16
+
+The masks are or'd against each other to produce a score for the cast.
+
+The scores of each conversion sequence are then summed to generate a score for
+the overload candidate. The overload candidate with the lowest score is the best
+candidate. If more than one overload is matched for the lowest score, the call
+is ambiguous.
diff --git a/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp b/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp
@@ -21,17 +21,6 @@
 using namespace clang;
 using namespace clang::interp;
 
-/// Unevaluated builtins don't get their arguments put on the stack
-/// automatically. They instead operate on the AST of their Call
-/// Expression.
-/// Similar information is available via ASTContext::BuiltinInfo,
-/// but that is not correct for our use cases.
-static bool isUnevaluatedBuiltin(unsigned BuiltinID) {
- return BuiltinID == Builtin::BI__builtin_classify_type ||
- BuiltinID == Builtin::BI__builtin_os_log_format_buffer_size ||
- BuiltinID == Builtin::BI__builtin_constant_p;
-}
-
 Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) {
 
  // Manually created functions that haven't been assigned proper
@@ -147,14 +136,11 @@ Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) {
  // Create a handle over the emitted code.
  Function *Func = P.getFunction(FuncDecl);
  if (!Func) {
- bool IsUnevaluatedBuiltin = false;
- if (unsigned BI = FuncDecl->getBuiltinID())
- IsUnevaluatedBuiltin = isUnevaluatedBuiltin(BI);
-
+ unsigned BuiltinID = FuncDecl->getBuiltinID();
  Func =
  P.createFunction(FuncDecl, ParamOffset, std::move(ParamTypes),
  std::move(ParamDescriptors), std::move(ParamOffsets),
- HasThisPointer, HasRVO, IsUnevaluatedBuiltin);
+ HasThisPointer, HasRVO, BuiltinID);
  }
 
  assert(Func);

diff --git a/clang/lib/AST/ByteCode/Function.cpp b/clang/lib/AST/ByteCode/Function.cpp
@@ -20,11 +20,10 @@ Function::Function(Program &P, FunctionDeclTy Source, unsigned ArgSize,
  llvm::SmallVectorImpl<PrimType> &&ParamTypes,
  llvm::DenseMap<unsigned, ParamDescriptor> &&Params,
  llvm::SmallVectorImpl<unsigned> &&ParamOffsets,
- bool HasThisPointer, bool HasRVO, bool UnevaluatedBuiltin)
+ bool HasThisPointer, bool HasRVO, unsigned BuiltinID)
  : P(P), Source(Source), ArgSize(ArgSize), ParamTypes(std::move(ParamTypes)),
  Params(std::move(Params)), ParamOffsets(std::move(ParamOffsets)),
- HasThisPointer(HasThisPointer), HasRVO(HasRVO),
- IsUnevaluatedBuiltin(UnevaluatedBuiltin) {
+ HasThisPointer(HasThisPointer), HasRVO(HasRVO), BuiltinID(BuiltinID) {
  if (const auto *F = Source.dyn_cast<const FunctionDecl *>())
  Variadic = F->isVariadic();
 }
@@ -53,3 +52,18 @@ bool Function::isVirtual() const {
  return M->isVirtual();
  return false;
 }
+
+/// Unevaluated builtins don't get their arguments put on the stack
+/// automatically. They instead operate on the AST of their Call
+/// Expression.
+/// Similar information is available via ASTContext::BuiltinInfo,
+/// but that is not correct for our use cases.
+static bool isUnevaluatedBuiltin(unsigned BuiltinID) {
+ return BuiltinID == Builtin::BI__builtin_classify_type ||
+ BuiltinID == Builtin::BI__builtin_os_log_format_buffer_size ||
+ BuiltinID == Builtin::BI__builtin_constant_p;
+}
+
+bool Function::isUnevaluatedBuiltin() const {
+ return ::isUnevaluatedBuiltin(BuiltinID);
+}
diff --git a/clang/lib/AST/ByteCode/Function.h b/clang/lib/AST/ByteCode/Function.h
@@ -193,15 +193,11 @@ class Function final {
 
  bool isVariadic() const { return Variadic; }
 
- unsigned getBuiltinID() const {
- return Source.get<const FunctionDecl *>()->getBuiltinID();
- }
+ unsigned getBuiltinID() const { return BuiltinID; }
 
- bool isBuiltin() const {
- return Source.get<const FunctionDecl *>()->getBuiltinID() != 0;
- }
+ bool isBuiltin() const { return getBuiltinID() != 0; }
 
- bool isUnevaluatedBuiltin() const { return IsUnevaluatedBuiltin; }
+ bool isUnevaluatedBuiltin() const;
 
  unsigned getNumParams() const { return ParamTypes.size(); }
 
@@ -232,7 +228,7 @@ class Function final {
  llvm::SmallVectorImpl<PrimType> &&ParamTypes,
  llvm::DenseMap<unsigned, ParamDescriptor> &&Params,
  llvm::SmallVectorImpl<unsigned> &&ParamOffsets, bool HasThisPointer,
- bool HasRVO, bool UnevaluatedBuiltin);
+ bool HasRVO, unsigned BuiltinID);
 
  /// Sets the code of a function.
  void setCode(unsigned NewFrameSize, std::vector<std::byte> &&NewCode,
@@ -289,7 +285,7 @@ class Function final {
  bool HasBody = false;
  bool Defined = false;
  bool Variadic = false;
- bool IsUnevaluatedBuiltin = false;
+ unsigned BuiltinID = 0;
 
 public:
  /// Dumps the disassembled bytecode to \c llvm::errs().

diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -2682,6 +2682,7 @@ void UnwrappedLineParser::parseSquare(bool LambdaIntroducer) {
  break;
  }
  case tok::at:
+ case tok::colon:
  nextToken();
  if (FormatTok->is(tok::l_brace)) {
  nextToken();

diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -3286,6 +3286,15 @@ TEST_F(TokenAnnotatorTest, BlockLBrace) {
  EXPECT_BRACE_KIND(Tokens[4], BK_Block);
  EXPECT_TOKEN(Tokens[5], tok::l_brace, TT_BlockLBrace);
  EXPECT_BRACE_KIND(Tokens[5], BK_Block);
+
+ Tokens = annotate("[foo bar:{{0, 1}} baz:baz];",
+ getLLVMStyle(FormatStyle::LK_ObjC));
+ ASSERT_EQ(Tokens.size(), 17u) << Tokens;
+ EXPECT_TOKEN(Tokens[4], tok::l_brace, TT_Unknown); // Not TT_BlockLBrace.
+ EXPECT_BRACE_KIND(Tokens[4], BK_Unknown); // Not BK_Block.
+ EXPECT_BRACE_KIND(Tokens[5], BK_BracedInit);
+ EXPECT_BRACE_KIND(Tokens[9], BK_Unknown); // Not BK_Block.
+ EXPECT_BRACE_KIND(Tokens[10], BK_Unknown); // Not BK_Block.
 }
 
 TEST_F(TokenAnnotatorTest, SwitchExpression) {

diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
@@ -478,6 +478,20 @@ void Fortran::lower::createGlobalInitialization(
  builder.restoreInsertionPoint(insertPt);
 }
 
+static unsigned getAllocatorIdx(cuf::DataAttributeAttr dataAttr) {
+ if (dataAttr) {
+ if (dataAttr.getValue() == cuf::DataAttribute::Pinned)
+ return kPinnedAllocatorPos;
+ if (dataAttr.getValue() == cuf::DataAttribute::Device)
+ return kDeviceAllocatorPos;
+ if (dataAttr.getValue() == cuf::DataAttribute::Managed)
+ return kManagedAllocatorPos;
+ if (dataAttr.getValue() == cuf::DataAttribute::Unified)
+ return kUnifiedAllocatorPos;
+ }
+ return kDefaultAllocator;
+}
+
 /// Create the global op and its init if it has one
 static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
  const Fortran::lower::pft::Variable &var,
@@ -540,8 +554,10 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
  // Create unallocated/disassociated descriptor if no explicit init
  Fortran::lower::createGlobalInitialization(
  builder, global, [&](fir::FirOpBuilder &b) {
- mlir::Value box =
- fir::factory::createUnallocatedBox(b, loc, symTy, std::nullopt);
+ mlir::Value box = fir::factory::createUnallocatedBox(
+ b, loc, symTy,
+ /*nonDeferredParams=*/std::nullopt,
+ /*typeSourceBox=*/{}, getAllocatorIdx(dataAttr));
  b.create<fir::HasValueOp>(loc, box);
  });
  }

diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf
@@ -2,6 +2,21 @@
 
 ! Test lowering of CUDA allocatable allocate/deallocate statements.
 
+module globals
+ real, device, allocatable :: a_device(:)
+ real, managed, allocatable :: a_managed(:)
+ real, pinned, allocatable :: a_pinned(:)
+end module
+
+! CHECK-LABEL: fir.global @_QMglobalsEa_device {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?xf32>>>
+! CHECK: %{{.*}} = fir.embox %{{.*}}(%{{.*}}) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
+
+! CHECK-LABEL: fir.global @_QMglobalsEa_managed {data_attr = #cuf.cuda<managed>} : !fir.box<!fir.heap<!fir.array<?xf32>>>
+! CHECK: %{{.*}} = fir.embox %{{.*}}(%{{.*}}) {allocator_idx = 3 : i32} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
+
+! CHECK-LABEL: fir.global @_QMglobalsEa_pinned {data_attr = #cuf.cuda<pinned>} : !fir.box<!fir.heap<!fir.array<?xf32>>>
+! CHECK: %{{.*}} = fir.embox %{{.*}}(%{{.*}}) {allocator_idx = 1 : i32} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
+
 subroutine sub1()
  real, allocatable, device :: a(:)
  allocate(a(10))