
Commit

Merge branch 'upstream-master'
Datadog Syncup Service committed Aug 28, 2024
2 parents 44069fb + 32c9750 commit e577398
Showing 32 changed files with 871 additions and 195 deletions.
2 changes: 2 additions & 0 deletions src/hotspot/cpu/riscv/assembler_riscv.hpp
@@ -1835,8 +1835,10 @@ enum Nf {

// Vector Unit-Stride Segment Load Instructions
INSN(vlseg3e8_v, 0b0000111, 0b000, 0b00000, 0b00, 0b0, g3);
INSN(vlseg4e8_v, 0b0000111, 0b000, 0b00000, 0b00, 0b0, g4);

// Vector Unit-Stride Segment Store Instructions
INSN(vsseg3e8_v, 0b0100111, 0b000, 0b00000, 0b00, 0b0, g3);
INSN(vsseg4e8_v, 0b0100111, 0b000, 0b00000, 0b00, 0b0, g4);

#undef INSN
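
For reference, a minimal C++ sketch (hypothetical helper names, not part of this patch) of what the unit-stride segmented load/store encodings above do for e8 elements: vlseg4e8.v de-interleaves groups of four consecutive bytes into four register groups, and vsseg3e8.v interleaves three register groups back into groups of three consecutive bytes.

#include <cstdint>
#include <cstddef>

// vlseg4e8.v: for each of the vl active elements, load 4 consecutive bytes
// and place field i of each group into destination register group i.
static void vlseg4e8_sketch(uint8_t* v0, uint8_t* v1, uint8_t* v2, uint8_t* v3,
                            const uint8_t* src, size_t vl) {
  for (size_t i = 0; i < vl; i++) {
    v0[i] = src[4 * i + 0];
    v1[i] = src[4 * i + 1];
    v2[i] = src[4 * i + 2];
    v3[i] = src[4 * i + 3];
  }
}

// vsseg3e8.v: the reverse direction with 3 fields, writing 3 * vl bytes to dst.
static void vsseg3e8_sketch(uint8_t* dst, const uint8_t* v0, const uint8_t* v1,
                            const uint8_t* v2, size_t vl) {
  for (size_t i = 0; i < vl; i++) {
    dst[3 * i + 0] = v0[i];
    dst[3 * i + 1] = v1[i];
    dst[3 * i + 2] = v2[i];
  }
}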
274 changes: 274 additions & 0 deletions src/hotspot/cpu/riscv/stubGenerator_riscv.cpp
@@ -5322,6 +5322,279 @@ class StubGenerator: public StubCodeGenerator {
return (address) start;
}

/**
* vector registers:
* input VectorRegister's: inputV1-V4, for m2 they are v2, v4, v6, v8, for m1 they are v1, v2, v3, v4
* index VectorRegister's: idxV1-V4, for m2 they are v10, v12, v14, v16, for m1 they are v5, v6, v7, v8
* output VectorRegister's: outputV1-V3, for m2 they are v18, v20, v22, for m1 they are v9, v10, v11
*
* NOTE: each field will occupy a single vector register group
*/
void base64_vector_decode_round(Register src, Register dst, Register codec,
Register size, Register stepSrc, Register stepDst, Register failedIdx, Register minusOne,
VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
Assembler::LMUL lmul) {
// set vector register type/len
__ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);

// segmented load src into v registers: mem(src) => vr(4)
__ vlseg4e8_v(inputV1, src);

// src = src + register_group_len_bytes * 4
__ add(src, src, stepSrc);

// decoding
// 1. indexed load: vr(4) => vr(4)
__ vluxei8_v(idxV1, codec, inputV1);
__ vluxei8_v(idxV2, codec, inputV2);
__ vluxei8_v(idxV3, codec, inputV3);
__ vluxei8_v(idxV4, codec, inputV4);

// 2. check for invalid input data
__ vor_vv(outputV1, idxV1, idxV2);
__ vor_vv(outputV2, idxV3, idxV4);
__ vor_vv(outputV1, outputV1, outputV2);
__ vmseq_vi(v0, outputV1, -1);
__ vfirst_m(failedIdx, v0);
Label NoFailure;
__ beq(failedIdx, minusOne, NoFailure);
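// if an invalid byte was found, decode and store only the elements before it:
// shrink vl to failedIdx and advance dst by failedIdx * 3 bytes instead of a full step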
__ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
__ slli(stepDst, failedIdx, 1);
__ add(stepDst, failedIdx, stepDst);
__ BIND(NoFailure);

// 3. compute the decoded data: vr(4) => vr(3)
__ vsll_vi(idxV1, idxV1, 2);
__ vsrl_vi(outputV1, idxV2, 4);
__ vor_vv(outputV1, outputV1, idxV1);

__ vsll_vi(idxV2, idxV2, 4);
__ vsrl_vi(outputV2, idxV3, 2);
__ vor_vv(outputV2, outputV2, idxV2);

__ vsll_vi(idxV3, idxV3, 6);
__ vor_vv(outputV3, idxV4, idxV3);

// segmented store decoded data from v registers back to dst: vr(3) => mem(dst)
__ vsseg3e8_v(outputV1, dst);

// dst = dst + register_group_len_bytes * 3
__ add(dst, dst, stepDst);
}
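
Per element, step 3 above packs four 6-bit code values into three plain bytes. A standalone C++ sketch of the same bit manipulation (hypothetical helper, not part of the patch); the e8 vector shifts drop the high bits naturally, just like the uint8_t casts here:

#include <cstdint>

// decode one element: b0..b3 are the 6-bit values produced by the indexed codec loads
static inline void decode4to3(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3,
                              uint8_t out[3]) {
  out[0] = (uint8_t)((b0 << 2) | (b1 >> 4)); // b0[5:0] + b1[5:4]
  out[1] = (uint8_t)((b1 << 4) | (b2 >> 2)); // b1[3:0] + b2[5:2]
  out[2] = (uint8_t)((b2 << 6) |  b3);       // b2[1:0] + b3[5:0]
}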

/**
* int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
*
* Input arguments:
* c_rarg0 - src, source array
* c_rarg1 - sp, src start offset
* c_rarg2 - sl, src end offset
* c_rarg3 - dst, dest array
* c_rarg4 - dp, dst start offset
* c_rarg5 - isURL, Base64 or URL character set
* c_rarg6 - isMIME, Decoding MIME block
*/
address generate_base64_decodeBlock() {

static const uint8_t fromBase64[256] = {
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
};

static const uint8_t fromBase64URL[256] = {
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
};
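
Each 256-entry table maps an ASCII byte to its 6-bit value, with 255u marking bytes outside the alphabet; loaded as a signed byte, 255u reads back as -1, which is what the vector (vmseq_vi ... -1) and scalar (bltz) error checks below rely on. A hypothetical standalone sanity check, written as if the tables were visible at file scope, to illustrate the encoding:

#include <cassert>

static void check_codec_tables() {
  assert(fromBase64['A'] == 0u && fromBase64['Z'] == 25u);        // upper case -> 0..25
  assert(fromBase64['a'] == 26u && fromBase64['z'] == 51u);       // lower case -> 26..51
  assert(fromBase64['0'] == 52u && fromBase64['9'] == 61u);       // digits -> 52..61
  assert(fromBase64['+'] == 62u && fromBase64['/'] == 63u);       // standard alphabet specials
  assert(fromBase64URL['-'] == 62u && fromBase64URL['_'] == 63u); // URL alphabet specials
  assert(fromBase64['='] == 255u); // '=' is not decoded by this stub; padding is left to the Java caller
}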

__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "decodeBlock");
address start = __ pc();
__ enter();

Register src = c_rarg0;
Register soff = c_rarg1;
Register send = c_rarg2;
Register dst = c_rarg3;
Register doff = c_rarg4;
Register isURL = c_rarg5;
Register isMIME = c_rarg6;

Register codec = c_rarg7;
Register dstBackup = x31;
Register length = x28; // t3, total length of src data in bytes

Label ProcessData, Exit;
Label ProcessScalar, ScalarLoop;

// the passed-in length (send - soff) is guaranteed to be > 4;
// this intrinsic only processes data whose length is a multiple of 4,
// which is not guaranteed at the Java level, so round it down explicitly
__ sub(length, send, soff);
__ andi(length, length, -4);
// real src/dst to process data
__ add(src, src, soff);
__ add(dst, dst, doff);
// backup of dst, used to calculate the return value at exit
__ mv(dstBackup, dst);

// load the codec base address
__ la(codec, ExternalAddress((address) fromBase64));
__ beqz(isURL, ProcessData);
__ la(codec, ExternalAddress((address) fromBase64URL));
__ BIND(ProcessData);

// vector version
if (UseRVV) {
// in the MIME case a line-length limit of 76 applies, which can be
// smaller than (send - soff), so for MIME we go through
// the scalar code path directly.
__ bnez(isMIME, ScalarLoop);

Label ProcessM1, ProcessM2;

Register failedIdx = soff;
Register stepSrcM1 = send;
Register stepSrcM2 = doff;
Register stepDst = isURL;
Register size = x29; // t4
Register minusOne = x30; // t5

__ mv(minusOne, -1);
__ mv(size, MaxVectorSize * 2);
__ mv(stepSrcM1, MaxVectorSize * 4);
__ slli(stepSrcM2, stepSrcM1, 1);
__ mv(stepDst, MaxVectorSize * 2 * 3);

__ blt(length, stepSrcM2, ProcessM1);


// Assembler::m2
__ BIND(ProcessM2);
base64_vector_decode_round(src, dst, codec,
size, stepSrcM2, stepDst, failedIdx, minusOne,
v2, v4, v6, v8, // inputs
v10, v12, v14, v16, // indexes
v18, v20, v22, // outputs
Assembler::m2);
__ sub(length, length, stepSrcM2);

// error check
__ bne(failedIdx, minusOne, Exit);

__ bge(length, stepSrcM2, ProcessM2);


// Assembler::m1
__ BIND(ProcessM1);
__ blt(length, stepSrcM1, ProcessScalar);

__ srli(size, size, 1);
__ srli(stepDst, stepDst, 1);
base64_vector_decode_round(src, dst, codec,
size, stepSrcM1, stepDst, failedIdx, minusOne,
v1, v2, v3, v4, // inputs
v5, v6, v7, v8, // indexes
v9, v10, v11, // outputs
Assembler::m1);
__ sub(length, length, stepSrcM1);

// error check
__ bne(failedIdx, minusOne, Exit);

__ BIND(ProcessScalar);
__ beqz(length, Exit);
}

// scalar version
{
Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
Register combined32Bits = x29; // t4

// encoded: [byte0[5:0] : byte1[5:0] : byte2[5:0] : byte3[5:0]] =>
// plain: [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
__ BIND(ScalarLoop);

// load 4 bytes encoded src data
__ lbu(byte0, Address(src, 0));
__ lbu(byte1, Address(src, 1));
__ lbu(byte2, Address(src, 2));
__ lbu(byte3, Address(src, 3));
__ addi(src, src, 4);

// get codec index and decode (i.e. load from codec by index)
__ add(byte0, codec, byte0);
__ add(byte1, codec, byte1);
__ lb(byte0, Address(byte0, 0));
__ lb(byte1, Address(byte1, 0));
__ add(byte2, codec, byte2);
__ add(byte3, codec, byte3);
__ lb(byte2, Address(byte2, 0));
__ lb(byte3, Address(byte3, 0));
__ slliw(byte0, byte0, 18);
__ slliw(byte1, byte1, 12);
__ orr(byte0, byte0, byte1);
__ orr(byte0, byte0, byte3);
__ slliw(byte2, byte2, 6);
// For performance, `combined32Bits` is constructed to serve two purposes at once:
// 1. the error check below
// 2. the decode below
__ orr(combined32Bits, byte0, byte2);

// error check
__ bltz(combined32Bits, Exit);

// store the 3 decoded bytes
__ sraiw(byte0, combined32Bits, 16);
__ sraiw(byte1, combined32Bits, 8);
__ sb(byte0, Address(dst, 0));
__ sb(byte1, Address(dst, 1));
__ sb(combined32Bits, Address(dst, 2));

__ sub(length, length, 4);
__ addi(dst, dst, 3);
// loop back
__ bnez(length, ScalarLoop);
}

__ BIND(Exit);
__ sub(c_rarg0, dst, dstBackup);

__ leave();
__ ret();

return (address) start;
}
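
For the scalar path above, a standalone C++ sketch (hypothetical, not part of the patch) of how `combined32Bits` serves both purposes: each codec entry is loaded as a signed byte, so any invalid input (255u -> -1) forces the combined 32-bit word negative, while a valid word simply holds the 24 decoded bits:

#include <cstdint>

static bool decode4_scalar(const int8_t codec[256], const uint8_t src[4], uint8_t dst[3]) {
  int32_t b0 = codec[src[0]]; // 0..63 for valid input, -1 for invalid
  int32_t b1 = codec[src[1]];
  int32_t b2 = codec[src[2]];
  int32_t b3 = codec[src[3]];
  uint32_t combined = ((uint32_t)b0 << 18) | ((uint32_t)b1 << 12) |
                      ((uint32_t)b2 << 6)  |  (uint32_t)b3;
  if ((int32_t)combined < 0) { // any -1 sets the sign bit; mirrors bltz(combined32Bits, Exit)
    return false;
  }
  dst[0] = (uint8_t)(combined >> 16);
  dst[1] = (uint8_t)(combined >> 8);
  dst[2] = (uint8_t)(combined);
  return true;
}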

void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
Register temp0, Register temp1, Register temp2, Register temp3,
@@ -5980,6 +6253,7 @@ static const int64_t right_3_bits = right_n_bits(3);

if (UseBASE64Intrinsics) {
StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
}

if (UseAdler32Intrinsics) {
59 changes: 43 additions & 16 deletions src/hotspot/share/gc/shared/gcLocker.cpp
@@ -33,11 +33,33 @@
#include "runtime/javaThread.inline.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/threadSMR.hpp"
#include "utilities/ticks.hpp"

volatile jint GCLocker::_jni_lock_count = 0;
volatile bool GCLocker::_needs_gc = false;
unsigned int GCLocker::_total_collections = 0;

// GCLockerTimingDebugLogger measures how long a thread waits on the GC locker and logs it at debug level.
class GCLockerTimingDebugLogger : public StackObj {
const char* _log_message;
Ticks _start;

public:
GCLockerTimingDebugLogger(const char* log_message) : _log_message(log_message) {
assert(_log_message != nullptr, "GC locker debug message must be set.");
_start = Ticks::now();
}

~GCLockerTimingDebugLogger() {
Log(gc, jni) log;
if (log.is_debug()) {
ResourceMark rm; // JavaThread::name() allocates to convert to UTF8
const Tickspan elapsed_time = Ticks::now() - _start;
log.debug("%s Resumed after " UINT64_FORMAT "ms. Thread \"%s\".", _log_message, elapsed_time.milliseconds(), Thread::current()->name());
}
}
};
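
The logger is a StackObj, so the measurement is purely RAII: construct it on the stack before a wait loop, and its destructor reports the elapsed time (only when gc+jni debug logging is enabled). A sketch of the pattern, mirroring the call sites in the hunks below:

{
  GCLockerTimingDebugLogger logger("Thread stalled by JNI critical section.");
  while (needs_gc()) {
    ml.wait(); // the destructor logs the total wait time once this scope is left
  }
}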

#ifdef ASSERT
volatile jint GCLocker::_debug_jni_lock_count = 0;
#endif
@@ -110,11 +132,11 @@ void GCLocker::stall_until_clear() {
if (needs_gc()) {
GCLockerTracer::inc_stall_count();
log_debug_jni("Allocation failed. Thread stalled by JNI critical section.");
}

// Wait for _needs_gc to be cleared
while (needs_gc()) {
ml.wait();
GCLockerTimingDebugLogger logger("Thread stalled by JNI critical section.");
// Wait for _needs_gc to be cleared
while (needs_gc()) {
ml.wait();
}
}
}

@@ -127,16 +149,20 @@ void GCLocker::jni_lock(JavaThread* thread) {
assert(!thread->in_critical(), "shouldn't currently be in a critical region");
MonitorLocker ml(JNICritical_lock);
// Block entering threads if there's a pending GC request.
while (needs_gc()) {
// There's at least one thread that has not left the critical region (CR)
// completely. When that last thread (no new threads can enter CR due to the
// blocking) exits CR, it calls `jni_unlock`, which sets `_needs_gc`
// to false and wakes up all blocked threads.
// We would like to assert #threads in CR to be > 0, `_jni_lock_count > 0`
// in the code, but it's too strong; it's possible that the last thread
// has called `jni_unlock`, but not yet finished the call, e.g. initiating
// a GCCause::_gc_locker GC.
ml.wait();
if (needs_gc()) {
log_debug_jni("Blocking thread as there is a pending GC request");
GCLockerTimingDebugLogger logger("Thread blocked to enter critical region.");
while (needs_gc()) {
// There's at least one thread that has not left the critical region (CR)
// completely. When that last thread (no new threads can enter CR due to the
// blocking) exits CR, it calls `jni_unlock`, which sets `_needs_gc`
// to false and wakes up all blocked threads.
// We would like to assert #threads in CR to be > 0, `_jni_lock_count > 0`
// in the code, but it's too strong; it's possible that the last thread
// has called `jni_unlock`, but not yet finished the call, e.g. initiating
// a GCCause::_gc_locker GC.
ml.wait();
}
}
thread->enter_critical();
_jni_lock_count++;
@@ -148,6 +174,7 @@ void GCLocker::jni_unlock(JavaThread* thread) {
MutexLocker mu(JNICritical_lock);
_jni_lock_count--;
decrement_debug_jni_lock_count();
log_debug_jni("Thread exiting critical region.");
thread->exit_critical();
if (needs_gc() && !is_active_internal()) {
// We're the last thread out. Request a GC.
@@ -161,7 +188,7 @@
{
// Must give up the lock while at a safepoint
MutexUnlocker munlock(JNICritical_lock);
log_debug_jni("Performing GC after exiting critical section.");
log_debug_jni("Last thread exiting. Performing GC after exiting critical section.");
Universe::heap()->collect(GCCause::_gc_locker);
}
_needs_gc = false;
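
For context, the critical regions tracked here are entered through JNI functions such as GetPrimitiveArrayCritical. A minimal sketch (hypothetical native method, not from this patch) of the pattern this lock protects: a GC requested while any thread is inside such a region is deferred until the last thread leaves.

#include <jni.h>

JNIEXPORT void JNICALL Java_Example_process(JNIEnv* env, jclass, jbyteArray arr) {
  jboolean is_copy;
  jbyte* data = (jbyte*)env->GetPrimitiveArrayCritical(arr, &is_copy); // enter critical region
  if (data != nullptr) {
    // ... work on data directly; no JNI calls or blocking operations in here ...
    env->ReleasePrimitiveArrayCritical(arr, data, 0);                  // exit critical region
  }
}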