Skip to content

Commit

Permalink
merge main into amd-staging
Browse files Browse the repository at this point in the history
Change-Id: Ia7274ea51389854c15b80a31ca2facd516381ed1
  • Loading branch information
Jenkins committed Aug 31, 2024
2 parents fd2e455 + ef50970 commit 54334d7
Show file tree
Hide file tree
Showing 73 changed files with 4,388 additions and 2,488 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/pr-code-format.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ jobs:
code_formatter:
runs-on: ubuntu-latest
timeout-minutes: 30
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
cancel-in-progress: true
if: github.repository == 'llvm/llvm-project'
steps:
- name: Fetch LLVM sources
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/release-binaries-save-stage/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ inputs:
required: true
type: 'string'

permissions:
contents: read

runs:
using: "composite"
steps:
Expand All @@ -18,6 +21,9 @@ runs:
- name: Package Build and Source Directories
shell: bash
run: |
# Remove .git/config to avoid leaking GITHUB_TOKEN stored there.
# See https://unit42.paloaltonetworks.com/github-repo-artifacts-leak-tokens/
rm -Rf .git/config
# Windows does not support symlinks, so we need to dereference them.
tar --exclude build/ ${{ (runner.os == 'Windows' && '-h') || '' }} -c . | zstd -T0 -c > ../llvm-project.tar.zst
mv ../llvm-project.tar.zst .
Expand Down
2 changes: 1 addition & 1 deletion clang/cmake/caches/Fuchsia-stage2.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ foreach(target riscv32-unknown-elf)
foreach(lang C;CXX;ASM)
# TODO: The preprocessor defines work around various issues in libc and libc++ integration.
# These should be addressed and removed over time.
set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imafc -mabi=ilp32f \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dtimeval=struct timeval{int tv_sec; int tv_usec;}\" \"-Dgettimeofday(tv, tz)\" -D_LIBCPP_PRINT=1" CACHE STRING "")
set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imafc -mabi=ilp32f -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dtimeval=struct timeval{int tv_sec; int tv_usec;}\" \"-Dgettimeofday(tv, tz)\" -D_LIBCPP_PRINT=1" CACHE STRING "")
endforeach()
foreach(type SHARED;MODULE;EXE)
set(RUNTIMES_${target}_CMAKE_${type}_LINKER_FLAGS "-fuse-ld=lld" CACHE STRING "")
Expand Down
121 changes: 109 additions & 12 deletions clang/docs/HLSL/ExpectedDifferences.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,19 @@ HLSL 202x based on proposal
and
`0008 <https://github.com/microsoft/hlsl-specs/blob/main/proposals/0008-non-member-operator-overloading.md>`_.

The largest difference between Clang's and DXC's overload resolution is the
algorithm used for identifying best-match overloads. The algorithmic differences
are described in more detail in the :ref:`multi_argument_overloads` section
below. Three high-level differences should be highlighted:

* **There should be no cases** where DXC and Clang both successfully
resolve an overload where the resolved overload is different between the two.
* There are cases where Clang will successfully resolve an overload that DXC
wouldn't because we've trimmed the overload set in Clang to remove ambiguity.
* There are cases where DXC will successfully resolve an overload that Clang
will not for two reasons: (1) DXC only generates partial overload sets for
builtin functions and (2) DXC resolves cases that probably should be ambiguous.

Clang's implementation extends standard overload resolution rules to HLSL
library functionality. This causes subtle changes in overload resolution
behavior between Clang and DXC. Some examples include:
Expand All @@ -71,18 +84,23 @@ behavior between Clang and DXC. Some examples include:
uint U;
int I;
float X, Y, Z;
double3 A, B;
double3 R, G;
}

void twoParams(int, int);
void twoParams(float, float);
void takesSingleDouble(double);
void takesSingleDouble(vector<double, 1>);

void scalarOrVector(double);
void scalarOrVector(vector<double, 2>);

export void call() {
halfOrInt16(U); // DXC: Fails with call ambiguous between int16_t and uint16_t overloads
// Clang: Resolves to halfOrInt16(uint16_t).
halfOrInt16(I); // All: Resolves to halfOrInt16(int16_t).
half H;
halfOrInt16(I); // All: Resolves to halfOrInt16(int16_t).

#ifndef IGNORE_ERRORS
halfOrInt16(U); // All: Fails with call ambiguous between int16_t and uint16_t
// overloads

// asfloat16 is a builtin with overloads for half, int16_t, and uint16_t.
H = asfloat16(I); // DXC: Fails to resolve overload for int.
// Clang: Resolves to asfloat16(int16_t).
Expand All @@ -94,21 +112,28 @@ behavior between Clang and DXC. Some examples include:

takesDoubles(X, Y, Z); // Works on all compilers
#ifndef IGNORE_ERRORS
fma(X, Y, Z); // DXC: Fails to resolve no known conversion from float to double.
fma(X, Y, Z); // DXC: Fails to resolve no known conversion from float to
// double.
// Clang: Resolves to fma(double,double,double).
#endif

double D = dot(A, B); // DXC: Resolves to dot(double3, double3), fails DXIL Validation.
double D = dot(R, G); // DXC: Resolves to dot(double3, double3), fails DXIL Validation.
// FXC: Expands to compute double dot product with fmul/fadd
// Clang: Resolves to dot(float3, float3), emits conversion warnings.
// Clang: Fails to resolve as ambiguous against
// dot(half, half) or dot(float, float)
#endif

#ifndef IGNORE_ERRORS
tan(B); // DXC: resolves to tan(float).
// Clang: Fails to resolve, ambiguous between integer types.

twoParams(I, X); // DXC: resolves twoParams(int, int).
// Clang: Fails to resolve ambiguous conversions.
#endif

double D;
takesSingleDouble(D); // All: Fails to resolve ambiguous conversions.
takesSingleDouble(R); // All: Fails to resolve ambiguous conversions.

scalarOrVector(D); // All: Resolves to scalarOrVector(double).
scalarOrVector(R); // All: Fails to resolve ambiguous conversions.
}

.. note::
Expand All @@ -119,3 +144,75 @@ behavior between Clang and DXC. Some examples include:
diagnostic notifying the user of the conversion rather than silently altering
precision relative to the other overloads (as FXC does) or generating code
that will fail validation (as DXC does).

.. _multi_argument_overloads:

Multi-Argument Overloads
------------------------

In addition to the differences in single-element conversions, Clang and DXC
differ dramatically in multi-argument overload resolution. C++ multi-argument
overload resolution behavior (or something very similar) is required to
implement
`non-member operator overloading <https://github.com/microsoft/hlsl-specs/blob/main/proposals/0008-non-member-operator-overloading.md>`_.

Clang adopts the C++-inspired language from the
`draft HLSL specification <https://microsoft.github.io/hlsl-specs/specs/hlsl.pdf>`_,
where an overload ``f1`` is a better candidate than ``f2`` if, for every
argument, the conversion sequence of ``f1`` is not worse than the corresponding
conversion sequence of ``f2`` and, for at least one argument, it is better.

.. code-block:: c++

cbuffer CB {
int I;
float X;
float4 V;
}

void twoParams(int, int);
void twoParams(float, float);
void threeParams(float, float, float);
void threeParams(float4, float4, float4);

export void call() {
twoParams(I, X); // DXC: resolves twoParams(int, int).
// Clang: Fails to resolve ambiguous conversions.

threeParams(X, V, V); // DXC: resolves threeParams(float4, float4, float4).
// Clang: Fails to resolve ambiguous conversions.
}

In the example above, ``twoParams`` called with mixed arguments produces the
implicit conversion sequences { ExactMatch, FloatingIntegral } and {
FloatingIntegral, ExactMatch }. Each sequence has an argument whose conversion
is worse than in the other sequence, so the call is ambiguous.

In the ``threeParams`` example the sequences are { ExactMatch, VectorTruncation,
VectorTruncation } and { VectorSplat, ExactMatch, ExactMatch }. Again, each
sequence has at least one argument whose conversion is worse than in the other
sequence, so the call is ambiguous.

.. note::

The DXC behavior described below is not officially documented; this description
is gleaned from observation and a bit of reading the source.

DXC's approach for determining the best overload produces an integer score value
for each implicit conversion sequence for each argument expression. Scores for
casts are based on a bitmask construction that is complicated to reverse
engineer. It seems that:

* Exact match is 0
* Dimension increase is 1
* Promotion is 2
* Integral -> Float conversion is 4
* Float -> Integral conversion is 8
* Cast is 16

The masks are OR'd together to produce a score for the cast.

The scores of the conversion sequences are then summed to generate a score for
the overload candidate. The overload candidate with the lowest score is the best
candidate. If more than one overload candidate has the lowest score, the call
is ambiguous.
18 changes: 2 additions & 16 deletions clang/lib/AST/ByteCode/ByteCodeEmitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,6 @@
using namespace clang;
using namespace clang::interp;

/// Unevaluated builtins don't get their arguments put on the stack
/// automatically. They instead operate on the AST of their Call
/// Expression.
/// Similar information is available via ASTContext::BuiltinInfo,
/// but that is not correct for our use cases.
static bool isUnevaluatedBuiltin(unsigned BuiltinID) {
return BuiltinID == Builtin::BI__builtin_classify_type ||
BuiltinID == Builtin::BI__builtin_os_log_format_buffer_size ||
BuiltinID == Builtin::BI__builtin_constant_p;
}

Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) {

// Manually created functions that haven't been assigned proper
Expand Down Expand Up @@ -147,14 +136,11 @@ Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) {
// Create a handle over the emitted code.
Function *Func = P.getFunction(FuncDecl);
if (!Func) {
bool IsUnevaluatedBuiltin = false;
if (unsigned BI = FuncDecl->getBuiltinID())
IsUnevaluatedBuiltin = isUnevaluatedBuiltin(BI);

unsigned BuiltinID = FuncDecl->getBuiltinID();
Func =
P.createFunction(FuncDecl, ParamOffset, std::move(ParamTypes),
std::move(ParamDescriptors), std::move(ParamOffsets),
HasThisPointer, HasRVO, IsUnevaluatedBuiltin);
HasThisPointer, HasRVO, BuiltinID);
}

assert(Func);
Expand Down
20 changes: 17 additions & 3 deletions clang/lib/AST/ByteCode/Function.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,10 @@ Function::Function(Program &P, FunctionDeclTy Source, unsigned ArgSize,
llvm::SmallVectorImpl<PrimType> &&ParamTypes,
llvm::DenseMap<unsigned, ParamDescriptor> &&Params,
llvm::SmallVectorImpl<unsigned> &&ParamOffsets,
bool HasThisPointer, bool HasRVO, bool UnevaluatedBuiltin)
bool HasThisPointer, bool HasRVO, unsigned BuiltinID)
: P(P), Source(Source), ArgSize(ArgSize), ParamTypes(std::move(ParamTypes)),
Params(std::move(Params)), ParamOffsets(std::move(ParamOffsets)),
HasThisPointer(HasThisPointer), HasRVO(HasRVO),
IsUnevaluatedBuiltin(UnevaluatedBuiltin) {
HasThisPointer(HasThisPointer), HasRVO(HasRVO), BuiltinID(BuiltinID) {
if (const auto *F = Source.dyn_cast<const FunctionDecl *>())
Variadic = F->isVariadic();
}
Expand Down Expand Up @@ -53,3 +52,18 @@ bool Function::isVirtual() const {
return M->isVirtual();
return false;
}

/// Reports whether \p BuiltinID names an "unevaluated" builtin.
/// Such builtins do not get their arguments pushed onto the stack
/// automatically; they operate on the AST of their call expression instead.
/// ASTContext::BuiltinInfo exposes similar information, but it is not
/// correct for our use cases.
static bool isUnevaluatedBuiltin(unsigned BuiltinID) {
  switch (BuiltinID) {
  case Builtin::BI__builtin_classify_type:
  case Builtin::BI__builtin_os_log_format_buffer_size:
  case Builtin::BI__builtin_constant_p:
    return true;
  default:
    // Every other ID (including 0) is not an unevaluated builtin.
    return false;
  }
}

/// Returns true if the builtin ID stored in this Function is one of the
/// unevaluated builtins recognized by the file-local isUnevaluatedBuiltin().
bool Function::isUnevaluatedBuiltin() const {
// The leading '::' selects the static free function of the same name; an
// unqualified call would resolve to this member function instead.
return ::isUnevaluatedBuiltin(BuiltinID);
}
14 changes: 5 additions & 9 deletions clang/lib/AST/ByteCode/Function.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,15 +193,11 @@ class Function final {

bool isVariadic() const { return Variadic; }

unsigned getBuiltinID() const {
return Source.get<const FunctionDecl *>()->getBuiltinID();
}
unsigned getBuiltinID() const { return BuiltinID; }

bool isBuiltin() const {
return Source.get<const FunctionDecl *>()->getBuiltinID() != 0;
}
bool isBuiltin() const { return getBuiltinID() != 0; }

bool isUnevaluatedBuiltin() const { return IsUnevaluatedBuiltin; }
bool isUnevaluatedBuiltin() const;

unsigned getNumParams() const { return ParamTypes.size(); }

Expand Down Expand Up @@ -232,7 +228,7 @@ class Function final {
llvm::SmallVectorImpl<PrimType> &&ParamTypes,
llvm::DenseMap<unsigned, ParamDescriptor> &&Params,
llvm::SmallVectorImpl<unsigned> &&ParamOffsets, bool HasThisPointer,
bool HasRVO, bool UnevaluatedBuiltin);
bool HasRVO, unsigned BuiltinID);

/// Sets the code of a function.
void setCode(unsigned NewFrameSize, std::vector<std::byte> &&NewCode,
Expand Down Expand Up @@ -289,7 +285,7 @@ class Function final {
bool HasBody = false;
bool Defined = false;
bool Variadic = false;
bool IsUnevaluatedBuiltin = false;
unsigned BuiltinID = 0;

public:
/// Dumps the disassembled bytecode to \c llvm::errs().
Expand Down
1 change: 1 addition & 0 deletions clang/lib/Format/UnwrappedLineParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2682,6 +2682,7 @@ void UnwrappedLineParser::parseSquare(bool LambdaIntroducer) {
break;
}
case tok::at:
case tok::colon:
nextToken();
if (FormatTok->is(tok::l_brace)) {
nextToken();
Expand Down
9 changes: 9 additions & 0 deletions clang/unittests/Format/TokenAnnotatorTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3286,6 +3286,15 @@ TEST_F(TokenAnnotatorTest, BlockLBrace) {
EXPECT_BRACE_KIND(Tokens[4], BK_Block);
EXPECT_TOKEN(Tokens[5], tok::l_brace, TT_BlockLBrace);
EXPECT_BRACE_KIND(Tokens[5], BK_Block);

Tokens = annotate("[foo bar:{{0, 1}} baz:baz];",
getLLVMStyle(FormatStyle::LK_ObjC));
ASSERT_EQ(Tokens.size(), 17u) << Tokens;
EXPECT_TOKEN(Tokens[4], tok::l_brace, TT_Unknown); // Not TT_BlockLBrace.
EXPECT_BRACE_KIND(Tokens[4], BK_Unknown); // Not BK_Block.
EXPECT_BRACE_KIND(Tokens[5], BK_BracedInit);
EXPECT_BRACE_KIND(Tokens[9], BK_Unknown); // Not BK_Block.
EXPECT_BRACE_KIND(Tokens[10], BK_Unknown); // Not BK_Block.
}

TEST_F(TokenAnnotatorTest, SwitchExpression) {
Expand Down
20 changes: 18 additions & 2 deletions flang/lib/Lower/ConvertVariable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,20 @@ void Fortran::lower::createGlobalInitialization(
builder.restoreInsertionPoint(insertPt);
}

/// Translate a CUDA Fortran data attribute into the matching allocator index.
/// A null attribute, or any attribute value other than Pinned, Device,
/// Managed or Unified, yields kDefaultAllocator.
static unsigned getAllocatorIdx(cuf::DataAttributeAttr dataAttr) {
  // No attribute attached: use the default allocator.
  if (!dataAttr)
    return kDefaultAllocator;

  switch (dataAttr.getValue()) {
  case cuf::DataAttribute::Pinned:
    return kPinnedAllocatorPos;
  case cuf::DataAttribute::Device:
    return kDeviceAllocatorPos;
  case cuf::DataAttribute::Managed:
    return kManagedAllocatorPos;
  case cuf::DataAttribute::Unified:
    return kUnifiedAllocatorPos;
  default:
    // Remaining attribute values have no dedicated allocator.
    return kDefaultAllocator;
  }
}

/// Create the global op and its init if it has one
static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
const Fortran::lower::pft::Variable &var,
Expand Down Expand Up @@ -540,8 +554,10 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
// Create unallocated/disassociated descriptor if no explicit init
Fortran::lower::createGlobalInitialization(
builder, global, [&](fir::FirOpBuilder &b) {
mlir::Value box =
fir::factory::createUnallocatedBox(b, loc, symTy, std::nullopt);
mlir::Value box = fir::factory::createUnallocatedBox(
b, loc, symTy,
/*nonDeferredParams=*/std::nullopt,
/*typeSourceBox=*/{}, getAllocatorIdx(dataAttr));
b.create<fir::HasValueOp>(loc, box);
});
}
Expand Down
15 changes: 15 additions & 0 deletions flang/test/Lower/CUDA/cuda-allocatable.cuf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,21 @@

! Test lowering of CUDA allocatable allocate/deallocate statements.

module globals
real, device, allocatable :: a_device(:)
real, managed, allocatable :: a_managed(:)
real, pinned, allocatable :: a_pinned(:)
end module

! CHECK-LABEL: fir.global @_QMglobalsEa_device {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?xf32>>>
! CHECK: %{{.*}} = fir.embox %{{.*}}(%{{.*}}) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>

! CHECK-LABEL: fir.global @_QMglobalsEa_managed {data_attr = #cuf.cuda<managed>} : !fir.box<!fir.heap<!fir.array<?xf32>>>
! CHECK: %{{.*}} = fir.embox %{{.*}}(%{{.*}}) {allocator_idx = 3 : i32} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>

! CHECK-LABEL: fir.global @_QMglobalsEa_pinned {data_attr = #cuf.cuda<pinned>} : !fir.box<!fir.heap<!fir.array<?xf32>>>
! CHECK: %{{.*}} = fir.embox %{{.*}}(%{{.*}}) {allocator_idx = 1 : i32} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>

subroutine sub1()
real, allocatable, device :: a(:)
allocate(a(10))
Expand Down
Loading

0 comments on commit 54334d7

Please sign in to comment.