Skip to content

Commit

Permalink
Optimize CPU simulation
Browse files Browse the repository at this point in the history
  • Loading branch information
WrathfulSpatula committed Nov 2, 2024
1 parent e9fcdd0 commit ad12e08
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 132 deletions.
24 changes: 21 additions & 3 deletions include/common/qrack_functions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@

#include <set>
#include <vector>
#if CPP_STD >= 20
#include <bit>
#endif

#define _bi_div_mod(left, right, quotient, rmndr) \
if (quotient) { \
Expand Down Expand Up @@ -83,7 +86,9 @@ namespace Qrack {
inline bitLenInt log2Ocl(bitCapIntOcl n)
{
// Source: https://stackoverflow.com/questions/11376288/fast-computing-of-log2-for-64-bit-integers#answer-11376759
#if ENABLE_INTRINSICS && defined(_WIN32) && !defined(__CYGWIN__)
#if CPP_STD >= 20
return std::bit_width(n) - 1U;
#elif ENABLE_INTRINSICS && defined(_WIN32) && !defined(__CYGWIN__)
#if UINTPOW < 6
return (bitLenInt)(bitsInByte * sizeof(unsigned int) - _lzcnt_u32((unsigned int)n) - 1U);
#else
Expand All @@ -95,8 +100,6 @@ inline bitLenInt log2Ocl(bitCapIntOcl n)
#else
return (bitLenInt)(bitsInByte * sizeof(unsigned long long) - __builtin_clzll((unsigned long long)n) - 1U);
#endif
#elif CPP_STD >= 20
return std::bit_width(n) - 1U;
#else
bitLenInt pow = 0U;
bitCapIntOcl p = n >> 1U;
Expand All @@ -108,6 +111,21 @@ inline bitLenInt log2Ocl(bitCapIntOcl n)
#endif
}

/**
 * Count the number of set bits (population count) in `n`.
 *
 * Three implementations are selected at compile time:
 *  - C++20 and later: `std::popcount` from `<bit>`.
 *  - GCC / Clang: the 64-bit compiler builtin.
 *  - Otherwise: a portable loop that clears the lowest set bit per iteration.
 *
 * @param n Value whose set bits are counted.
 * @return Number of bits set in `n`.
 */
inline bitLenInt popCountOcl(bitCapIntOcl n)
{
#if CPP_STD >= 20
    return (bitLenInt)std::popcount(n);
#elif defined(__GNUC__) || defined(__clang__)
    // __builtin_popcount() takes `unsigned int`, so it would silently discard the high bits of a wider
    // bitCapIntOcl. Use the `unsigned long long` variant, as log2Ocl() does with __builtin_clzll().
    return (bitLenInt)__builtin_popcountll((unsigned long long)n);
#else
    bitLenInt popCount = 0U;
    while (n) {
        // n & (n - 1) clears the lowest set bit, so the loop runs once per set bit.
        n &= n - 1U;
        ++popCount;
    }
    return popCount;
#endif
}

#if (QBCAPPOW < 7) || ((QBCAPPOW < 8) && defined(__SIZEOF_INT128__))
// Only defined under the enclosing QBCAPPOW guard: forwards to log2Ocl() and returns its result as an int.
inline int bi_log2(const bitCapInt& n)
{
    return (int)log2Ocl(n);
}
#endif
Expand Down
3 changes: 0 additions & 3 deletions include/statevector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,8 @@ class StateVector : public ParallelFor {
bitCapIntOcl capacity;

public:
bool isReadLocked;

StateVector(bitCapIntOcl cap)
: capacity(cap)
, isReadLocked(true)
{
}
virtual ~StateVector()
Expand Down
5 changes: 4 additions & 1 deletion src/common/qengine.cu
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,10 @@ __device__ inline qCudaReal1 qCudaDot(qCudaReal4 a, qCudaReal4 b)
#if FPPOW > 4
// FPPOW > 4 variant: builds a complex value from cos(theta) and sin(theta), evaluated directly on qCudaReal1.
// (Presumably the components are (real, imaginary), i.e. the unit-magnitude value at phase angle theta.)
__device__ inline qCudaCmplx polar_unit(const qCudaReal1 theta)
{
    const auto cosPart = cos(theta);
    const auto sinPart = sin(theta);
    return make_qCudaCmplx(cosPart, sinPart);
}
#else
__device__ inline qCudaCmplx polar_unit(const qCudaReal1 theta) { return make_qCudaCmplx((qCudaReal1)cos((qCudaReal1_f)theta), (qCudaReal1)sin((qCudaReal1_f)theta)); }
// FPPOW <= 4 variant: theta is first converted to qCudaReal1_f, cos/sin are evaluated on that type, and each
// result is converted back to qCudaReal1 before the complex value is assembled.
__device__ inline qCudaCmplx polar_unit(const qCudaReal1 theta)
{
    const qCudaReal1_f angle = (qCudaReal1_f)theta;
    const qCudaReal1 cosPart = (qCudaReal1)cos(angle);
    const qCudaReal1 sinPart = (qCudaReal1)sin(angle);
    return make_qCudaCmplx(cosPart, sinPart);
}
#endif

// Returns a copy of `a` with its `x` component unchanged and its `y` component negated.
// (Presumably `x` is the real part and `y` the imaginary part, making this the complex conjugate.)
__device__ inline qCudaCmplx qCudaConj(qCudaCmplx a)
{
    const auto negatedY = -a.y;
    return make_qCudaCmplx(a.x, negatedY);
}
Expand Down
17 changes: 0 additions & 17 deletions src/qengine/arithmetic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ void QEngineCPU::ROL(bitLenInt shift, bitLenInt start, bitLenInt length)
Finish();

StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
stateVec->isReadLocked = false;

par_for(0, maxQPowerOcl, [&](const bitCapIntOcl& lcv, const unsigned& cpu) {
const bitCapIntOcl otherRes = lcv & otherMask;
Expand Down Expand Up @@ -83,7 +82,6 @@ void QEngineCPU::INC(const bitCapInt& toAdd, bitLenInt inOutStart, bitLenInt len
Finish();

StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
stateVec->isReadLocked = false;

par_for(0, maxQPowerOcl, [&](const bitCapIntOcl& lcv, const unsigned& cpu) {
const bitCapIntOcl otherRes = lcv & otherMask;
Expand Down Expand Up @@ -138,7 +136,6 @@ void QEngineCPU::CINC(

StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
nStateVec->copy(stateVec);
stateVec->isReadLocked = false;

par_for_mask(0, maxQPowerOcl, controlPowers, [&](const bitCapIntOcl& lcv, const unsigned& cpu) {
const bitCapIntOcl otherRes = lcv & otherMask;
Expand Down Expand Up @@ -182,7 +179,6 @@ void QEngineCPU::INCDECC(const bitCapInt& toMod, bitLenInt inOutStart, bitLenInt

StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
nStateVec->clear();
stateVec->isReadLocked = false;

par_for_skip(0, maxQPowerOcl, pow2Ocl(carryIndex), 1U, [&](const bitCapIntOcl& lcv, const unsigned& cpu) {
const bitCapIntOcl otherRes = lcv & otherMask;
Expand Down Expand Up @@ -235,7 +231,6 @@ void QEngineCPU::INCS(const bitCapInt& toAdd, bitLenInt inOutStart, bitLenInt le

StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
nStateVec->clear();
stateVec->isReadLocked = false;

par_for(0, maxQPowerOcl, [&](const bitCapIntOcl& lcv, const unsigned& cpu) {
const bitCapIntOcl otherRes = lcv & otherMask;
Expand Down Expand Up @@ -286,7 +281,6 @@ void QEngineCPU::INCDECSC(const bitCapInt& toMod, bitLenInt inOutStart, bitLenIn

StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
nStateVec->clear();
stateVec->isReadLocked = false;

par_for_skip(0, maxQPowerOcl, carryMask, 1U, [&](const bitCapIntOcl& lcv, const unsigned& cpu) {
const bitCapIntOcl otherRes = lcv & otherMask;
Expand Down Expand Up @@ -344,7 +338,6 @@ void QEngineCPU::INCDECSC(

StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
nStateVec->clear();
stateVec->isReadLocked = false;

par_for_skip(0, maxQPowerOcl, carryMask, 1U, [&](const bitCapIntOcl& lcv, const unsigned& cpu) {
const bitCapIntOcl otherRes = lcv & otherMask;
Expand Down Expand Up @@ -388,7 +381,6 @@ void QEngineCPU::MULDIV(const IOFn& inFn, const IOFn& outFn, const bitCapInt& to

StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
nStateVec->clear();
stateVec->isReadLocked = false;

par_for_skip(0, maxQPowerOcl, pow2Ocl(carryStart), length, [&](const bitCapIntOcl& lcv, const unsigned& cpu) {
const bitCapIntOcl otherRes = lcv & otherMask;
Expand Down Expand Up @@ -470,7 +462,6 @@ void QEngineCPU::CMULDIV(const IOFn& inFn, const IOFn& outFn, const bitCapInt& t

StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
nStateVec->clear();
stateVec->isReadLocked = false;

par_for_mask(0, maxQPowerOcl, skipPowers, [&](const bitCapIntOcl& lcv, const unsigned& cpu) {
const bitCapIntOcl otherRes = lcv & otherMask;
Expand Down Expand Up @@ -563,7 +554,6 @@ void QEngineCPU::ModNOut(const MFn& kernelFn, const bitCapInt& modN, const bitLe

StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
nStateVec->clear();
stateVec->isReadLocked = false;

par_for_skip(0, maxQPowerOcl, pow2Ocl(outStart), length, [&](const bitCapIntOcl& lcv, const unsigned& cpu) {
const bitCapIntOcl otherRes = lcv & otherMask;
Expand Down Expand Up @@ -656,7 +646,6 @@ void QEngineCPU::CModNOut(const MFn& kernelFn, const bitCapInt& modN, const bitL

StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
nStateVec->clear();
stateVec->isReadLocked = false;

par_for_mask(0, maxQPowerOcl, skipPowers, [&](const bitCapIntOcl& lcv, const unsigned& cpu) {
const bitCapIntOcl otherRes = lcv & otherMask;
Expand Down Expand Up @@ -758,7 +747,6 @@ void QEngineCPU::INCBCD(const bitCapInt& toAdd, bitLenInt inOutStart, bitLenInt

StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
nStateVec->clear();
stateVec->isReadLocked = false;

par_for(0, maxQPowerOcl, [&](const bitCapIntOcl& lcv, const unsigned& cpu) {
const bitCapIntOcl otherRes = lcv & otherMask;
Expand Down Expand Up @@ -833,7 +821,6 @@ void QEngineCPU::INCDECBCDC(const bitCapInt& toMod, bitLenInt inOutStart, bitLen

StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
nStateVec->clear();
stateVec->isReadLocked = false;

par_for_skip(0, maxQPowerOcl, pow2Ocl(carryIndex), ONE_BCI, [&](const bitCapIntOcl& lcv, const unsigned& cpu) {
const bitCapIntOcl otherRes = lcv & otherMask;
Expand Down Expand Up @@ -919,7 +906,6 @@ bitCapInt QEngineCPU::IndexedLDA(bitLenInt indexStart, bitLenInt indexLength, bi

StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
nStateVec->clear();
stateVec->isReadLocked = false;

ParallelFunc fn;
if (valueBytes == 1) {
Expand Down Expand Up @@ -1004,7 +990,6 @@ bitCapInt QEngineCPU::IndexedADC(bitLenInt indexStart, bitLenInt indexLength, bi
// We calloc a new stateVector for output.
StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
nStateVec->clear();
stateVec->isReadLocked = false;

// We're going to loop over every eigenstate in the vector, (except, we
// already know the carry is zero). These bit masks let us quickly
Expand Down Expand Up @@ -1121,7 +1106,6 @@ bitCapInt QEngineCPU::IndexedSBC(bitLenInt indexStart, bitLenInt indexLength, bi
// We calloc a new stateVector for output.
StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
nStateVec->clear();
stateVec->isReadLocked = false;

// We're going to loop over every eigenstate in the vector, (except, we already know the carry is zero).
// These bit masks let us quickly distinguish the different values of the input register, output register, carry, and
Expand Down Expand Up @@ -1214,7 +1198,6 @@ void QEngineCPU::Hash(bitLenInt start, bitLenInt length, const unsigned char* va

StateVectorPtr nStateVec = AllocStateVec(maxQPowerOcl);
nStateVec->clear();
stateVec->isReadLocked = false;

par_for(0, maxQPowerOcl, [&](const bitCapIntOcl& lcv, const unsigned& cpu) {
const bitCapIntOcl inputRes = lcv & inputMask;
Expand Down
Loading

0 comments on commit ad12e08

Please sign in to comment.