[WebAssembly] Implement prototype f32.store_f16 instruction. (llvm#91545

) Adds a builtin and intrinsic for the f32.store_f16 instruction. The instruction stores an f32 value as an f16 memory. Specified at: https://github.com/WebAssembly/half-precision/blob/29a9b9462c9285d4ccc1a5dc39214ddfd1892658/proposals/half-precision/Overview.md Note: the current spec has f32.store_f16 as opcode 0xFD0121, but this is incorrect and will be changed to 0xFC31 soon.
ROCm · May 9, 2024 · 8a3277a · 8a3277a
1 parent a99cb96
commit 8a3277a
Show file tree

Hide file tree

Showing 10 changed files with 75 additions and 2 deletions.
diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -192,6 +192,7 @@ TARGET_BUILTIN(__builtin_wasm_relaxed_dot_bf16x8_add_f32_f32x4, "V4fV8UsV8UsV4f"
 
 // Half-Precision (fp16)
 TARGET_BUILTIN(__builtin_wasm_loadf16_f32, "fh*", "nU", "half-precision")
+TARGET_BUILTIN(__builtin_wasm_storef16_f32, "vfh*", "n", "half-precision")
 
 // Reference Types builtins
 // Some builtins are custom type-checked - see 't' as part of the third argument,

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -21310,6 +21310,12 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_loadf16_f32);
     return Builder.CreateCall(Callee, {Addr});
   }
+  case WebAssembly::BI__builtin_wasm_storef16_f32: {
+    Value *Val = EmitScalarExpr(E->getArg(0));
+    Value *Addr = EmitScalarExpr(E->getArg(1));
+    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_storef16_f32);
+    return Builder.CreateCall(Callee, {Val, Addr});
+  }
   case WebAssembly::BI__builtin_wasm_table_get: {
     assert(E->getArg(0)->getType()->isArrayType());
     Value *Table = EmitArrayToPointerDecay(E->getArg(0)).emitRawPointer(*this);

diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
@@ -807,6 +807,12 @@ float load_f16_f32(__fp16 *addr) {
   // WEBASSEMBLY: call float @llvm.wasm.loadf16.f32(ptr %{{.*}})
 }
 
+void store_f16_f32(float val, __fp16 *addr) {
+  return __builtin_wasm_storef16_f32(val, addr);
+  // WEBASSEMBLY: tail call void @llvm.wasm.storef16.f32(float %val, ptr %{{.*}})
+  // WEBASSEMBLY-NEXT: ret
+}
+
 __externref_t externref_null() {
   return __builtin_wasm_ref_null_extern();
   // WEBASSEMBLY: tail call ptr addrspace(10) @llvm.wasm.ref.null.extern()

diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -332,6 +332,11 @@ def int_wasm_loadf16_f32:
             [llvm_ptr_ty],
             [IntrReadMem, IntrArgMemOnly],
              "", [SDNPMemOperand]>;
+def int_wasm_storef16_f32:
+  Intrinsic<[],
+            [llvm_float_ty, llvm_ptr_ty],
+            [IntrWriteMem, IntrArgMemOnly],
+             "", [SDNPMemOperand]>;
 
 
 //===----------------------------------------------------------------------===//

diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -207,6 +207,7 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
   WASM_LOAD_STORE(LOAD_LANE_I16x8)
   WASM_LOAD_STORE(STORE_LANE_I16x8)
   WASM_LOAD_STORE(LOAD_F16_F32)
+  WASM_LOAD_STORE(STORE_F16_F32)
   return 1;
   WASM_LOAD_STORE(LOAD_I32)
   WASM_LOAD_STORE(LOAD_F32)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -914,6 +914,14 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.align = Align(2);
     Info.flags = MachineMemOperand::MOLoad;
     return true;
+  case Intrinsic::wasm_storef16_f32:
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = MVT::f16;
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = 0;
+    Info.align = Align(2);
+    Info.flags = MachineMemOperand::MOStore;
+    return true;
   default:
     return false;
   }

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -72,8 +72,9 @@ defm LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.load16_u", 0x33, []>;
 defm LOAD32_S_I64 : WebAssemblyLoad<I64, "i64.load32_s", 0x34, []>;
 defm LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.load32_u", 0x35, []>;
 
-// Half Precision
-defm LOAD_F16_F32 : WebAssemblyLoad<F32, "f32.load_f16", 0xfc30, [HasHalfPrecision]>;
+// Half-precision load.
+defm LOAD_F16_F32 :
+  WebAssemblyLoad<F32, "f32.load_f16", 0xfc30, [HasHalfPrecision]>;
 
 // Pattern matching
 
@@ -171,12 +172,18 @@ defm STORE8_I64 : WebAssemblyStore<I64, "i64.store8", 0x3c>;
 defm STORE16_I64 : WebAssemblyStore<I64, "i64.store16", 0x3d>;
 defm STORE32_I64 : WebAssemblyStore<I64, "i64.store32", 0x3e>;
 
+// Half-precision store.
+defm STORE_F16_F32 :
+  WebAssemblyStore<F32, "f32.store_f16", 0xfc31, [HasHalfPrecision]>;
+
 defm : StorePat<i32, truncstorei8, "STORE8_I32">;
 defm : StorePat<i32, truncstorei16, "STORE16_I32">;
 defm : StorePat<i64, truncstorei8, "STORE8_I64">;
 defm : StorePat<i64, truncstorei16, "STORE16_I64">;
 defm : StorePat<i64, truncstorei32, "STORE32_I64">;
 
+defm : StorePat<f32, int_wasm_storef16_f32, "STORE_F16_F32">;
+
 multiclass MemoryOps<WebAssemblyRegClass rc, string B> {
 // Current memory size.
 defm MEMORY_SIZE_A#B : I<(outs rc:$dst), (ins i32imm:$flags),

diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s --mtriple=wasm64-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+half-precision | FileCheck %s
 
 declare float @llvm.wasm.loadf32.f16(ptr)
+declare void @llvm.wasm.storef16.f32(float, ptr)
 
 ; CHECK-LABEL: ldf16_32:
 ; CHECK:      f32.load_f16 $push[[NUM0:[0-9]+]]=, 0($0){{$}}
@@ -10,3 +11,11 @@ define float @ldf16_32(ptr %p) {
   %v = call float @llvm.wasm.loadf16.f32(ptr %p)
   ret float %v
 }
+
+; CHECK-LABEL: stf16_32:
+; CHECK:       f32.store_f16 0($1), $0
+; CHECK-NEXT:  return
+define void @stf16_32(float %v, ptr %p) {
+  tail call void @llvm.wasm.storef16.f32(float %v, ptr %p)
+  ret void
+}
diff --git a/llvm/test/CodeGen/WebAssembly/offset.ll b/llvm/test/CodeGen/WebAssembly/offset.ll
@@ -692,3 +692,30 @@ define float @load_f16_f32_with_folded_gep_offset(ptr %p) {
   %t = call float @llvm.wasm.loadf16.f32(ptr %s)
   ret float %t
 }
+
+;===----------------------------------------------------------------------------
+; Stores: Half Precision
+;===----------------------------------------------------------------------------
+
+; Basic store.
+
+; CHECK-LABEL: store_f16_f32_no_offset:
+; CHECK-NEXT: .functype store_f16_f32_no_offset (i32, f32) -> (){{$}}
+; CHECK-NEXT: f32.store_f16 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_f16_f32_no_offset(ptr %p, float %v) {
+  call void @llvm.wasm.storef16.f32(float %v, ptr %p)
+  ret void
+}
+
+; Storing to a fixed address.
+
+; CHECK-LABEL: store_f16_f32_to_numeric_address:
+; CHECK:      i32.const $push1=, 0{{$}}
+; CHECK-NEXT: f32.const $push0=, 0x0p0{{$}}
+; CHECK-NEXT: f32.store_f16 42($pop1), $pop0{{$}}
+define void @store_f16_f32_to_numeric_address() {
+  %s = inttoptr i32 42 to ptr
+  call void @llvm.wasm.storef16.f32(float 0.0, ptr %s)
+  ret void
+}
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -842,4 +842,7 @@ main:
     # CHECK: f32.load_f16 48 # encoding: [0xfc,0x30,0x01,0x30]
     f32.load_f16 48
 
+    # CHECK: f32.store_f16 32 # encoding: [0xfc,0x31,0x01,0x20]
+    f32.store_f16 32
+
     end_function