From 14ee2c9b07f5b443d2bd94415606ffde8e435bb2 Mon Sep 17 00:00:00 2001 From: Stefan Krastanov Date: Wed, 5 Jul 2023 02:19:29 -0400 Subject: [PATCH] Memory layout optimizations and removing Polyester (#143) --- CHANGELOG.md | 5 ++++ Project.toml | 4 +--- benchmark/benchmarks.jl | 1 + src/QuantumClifford.jl | 12 +++------- src/dense_cliffords.jl | 6 +++-- src/fastmemlayout.jl | 36 ++++++++++++++++++++++++++++ src/mctrajectory.jl | 2 +- src/mul_leftright.jl | 49 +++++++++++++++++++++++++++++++++++---- src/pauli_frames.jl | 26 ++++++++------------- src/symbolic_cliffords.jl | 7 +++--- test/Project.toml | 1 - test/runtests.jl | 1 + test/test_jet.jl | 2 -- test/test_memorylayout.jl | 23 ++++++++++++++++++ 14 files changed, 134 insertions(+), 41 deletions(-) create mode 100644 src/fastmemlayout.jl create mode 100644 test/test_memorylayout.jl diff --git a/CHANGELOG.md b/CHANGELOG.md index 055c38e9a..68c18278b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ # News +## v0.8.10 - 2023-07-05 + +- Remove Polyester.jl multithreading, leading to simpler and better compiled single-thread code. Now single-thread performance is much higher. Multithreading was generally not useful at the inner loops where it was deployed. +- Implement `fastcolumn` and `fastrow`, which transform a tableau into a memory layout optimized for column or row operations. + ## v0.8.9 - 2023-07-04 - In the unexported experimental ECC module: diff --git a/Project.toml b/Project.toml index 8766adfdd..d13db2220 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "QuantumClifford" uuid = "0525e862-1e90-11e9-3e4d-1b39d7109de1" authors = ["Stefan Krastanov "] -version = "0.8.9" +version = "0.8.10" [deps] Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" @@ -13,7 +13,6 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" Nemo = "2edaba10-b0f1-5616-af89-8c11ac63239a" -Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" QuantumInterface = "5717a53b-5d69-4fa3-b976-0bf2f97ca1e5" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -40,7 +39,6 @@ MacroTools = "0.5.9" Makie = "0.19" Nemo = "0.34" Plots = "1.38.0" -Polyester = "0.7" PrecompileTools = "1" Quantikz = "1.2" QuantumInterface = "0.3.0" diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 8f8bba86a..8b31e501f 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -133,6 +133,7 @@ SUITE["circuitsim"]["pftrajectories_sumtype"]["q101_r1"] = @benchmarkable p SUITE["circuitsim"]["pftrajectories_sumtype"]["q1001_r1"] = @benchmarkable pftrajectories(state,circuit) setup=(state=PauliFrame( 1, 1001, 1); circuit=compactify_circuit(x_diag_circuit(1000))) evals=1 SUITE["circuitsim"]["pftrajectories_sumtype"]["q1001_r100"] = @benchmarkable pftrajectories(state,circuit) setup=(state=PauliFrame( 100, 1001, 1); circuit=compactify_circuit(x_diag_circuit(1000))) evals=1 SUITE["circuitsim"]["pftrajectories_sumtype"]["q1001_r10000"] = @benchmarkable pftrajectories(state,circuit) setup=(state=PauliFrame(1000, 1001, 1); circuit=compactify_circuit(x_diag_circuit(1000))) evals=1 +SUITE["circuitsim"]["pftrajectories_sumtype"]["q1001_r10000_fastrow"] = @benchmarkable pftrajectories(state,circuit) setup=(state=fastrow(PauliFrame(1000, 1001, 1)); circuit=compactify_circuit(x_diag_circuit(1000))) evals=1 SUITE["circuitsim"]["mctrajectories_sumtype"] = BenchmarkGroup(["mctrajectories_sumtype"]) SUITE["circuitsim"]["mctrajectories_sumtype"]["q101_r1"] = @benchmarkable mctrajectory!(state, circuit) setup=(state=Register(one(Stabilizer, 101), [false]); circuit=compactify_circuit(x_diag_circuit( 100))) evals=1 SUITE["circuitsim"]["mctrajectories_sumtype"]["q1001_r1"] = @benchmarkable mctrajectory!(state, circuit) setup=(state=Register(one(Stabilizer, 1001), [false]); circuit=compactify_circuit(x_diag_circuit(1000))) evals=1 diff --git a/src/QuantumClifford.jl b/src/QuantumClifford.jl index 70b4a4df2..34d95f4d4 100644 --- a/src/QuantumClifford.jl +++ b/src/QuantumClifford.jl @@ -8,12 +8,8 @@ module QuantumClifford # TODO Significant performance improvements: many operations do not need phase=true if the Pauli operations commute import LinearAlgebra -using LinearAlgebra: inv, mul!, rank +using LinearAlgebra: inv, mul!, rank, Adjoint using DocStringExtensions -using Polyester -#using LoopVectorization -using HostCPUFeatures: pick_vector_width -import SIMD import QuantumInterface: tensor, ⊗, tensor_pow, apply!, nqubits, expect, project!, reset_qubits!, traceout!, ptrace, apply!, projectX!, projectY!, projectZ!, entanglement_entropy @@ -25,6 +21,7 @@ export prodphase, comm, nqubits, stabilizerview, destabilizerview, logicalxview, logicalzview, phases, + fastcolumn, fastrow, bitview, quantumstate, tab, BadDataStructure, affectedqubits, #TODO move to QuantumInterface? @@ -91,10 +88,6 @@ function __init__() BIG_INT_FOUR[] = BigInt(4) end -const MINBATCH1Q = 100 -const MINBATCH2Q = 100 -const MINBATCHDENSE = 25 - const NoZeroQubit = ArgumentError("Qubit indices have to be larger than zero, but you attempting are creating a gate acting on a qubit with a non-positive index. Ensure indexing always starts from 1.") # Predefined constants representing the permitted phases encoded @@ -1276,6 +1269,7 @@ include("experimental/Experimental.jl") include("graphs.jl") include("entanglement.jl") include("tableau_show.jl") +include("fastmemlayout.jl") include("sumtypes.jl") include("precompiles.jl") include("ecc/ECC.jl") diff --git a/src/dense_cliffords.jl b/src/dense_cliffords.jl index a67bd2013..32dd6e6b5 100644 --- a/src/dense_cliffords.jl +++ b/src/dense_cliffords.jl @@ -127,7 +127,8 @@ function _apply!(stab::AbstractStabilizer, c::CliffordOperator; phases::Val{B}=V nqubits(stab)==nqubits(c) || throw(DimensionMismatch("The tableau and the Clifford operator need to act on the same number of qubits. Consider specifying an array of indices as a third argument to the `apply!` function to avoid this error.")) s_tab = tab(stab) c_tab = tab(c) - @batch minbatch=MINBATCHDENSE threadlocal=zero(c_tab[1]) for row_stab in eachindex(s_tab) + threadlocal=zero(c_tab[1]) + @inbounds @simd for row_stab in eachindex(s_tab) zero!(threadlocal) # a new stabrow for temporary storage apply_row_kernel!(threadlocal, row_stab, s_tab, c_tab, phases=phases) end @@ -172,7 +173,8 @@ function _apply!(stab::AbstractStabilizer, c::CliffordOperator, indices_of_appli #max(indices_of_application)<=nqubits(s) || throw(DimensionMismatch("")) # Too expensive to check every time s_tab = tab(stab) c_tab = tab(c) - @batch minbatch=25 threadlocal=zero(c_tab[1]) for row_stab in eachindex(s_tab) + threadlocal=zero(c_tab[1]) + @inbounds @simd for row_stab in eachindex(s_tab) zero!(threadlocal) # a new stabrow for temporary storage apply_row_kernel!(threadlocal, row_stab, s_tab, c_tab, indices_of_application, phases=phases) end diff --git a/src/fastmemlayout.jl b/src/fastmemlayout.jl new file mode 100644 index 000000000..b9d5eb461 --- /dev/null +++ b/src/fastmemlayout.jl @@ -0,0 +1,36 @@ +"""Convert a tableau to a memory layout that is fast for row operations. + +In this layout a Pauli string (a row of the tableau) is stored contiguously in memory. + +See also: [`fastrow`](@ref)""" +function fastrow end + +"""Convert a tableau to a memory layout that is fast for column operations. + +In this layout a column of the tableau is stored (mostly) contiguously in memory. +Due to bitpacking, e.g., packing 64 bits into a single `UInt64`, +the memory layout is not perfectly contiguous, +but it is still optimal given that some bitwrangling is required to extract a given bit. + +See also: [`fastrow`](@ref)""" +function fastcolumn end + +fastrow(t::Tableau{Tzv,Tm}) where {Tzv, Tm} = t +fastrow(t::Tableau{Tzv,Tm}) where {Tzv, Tm<:Adjoint} = Tableau(t.phases, t.nqubits, collect(t.xzs)) +fastcolumn(t::Tableau{Tzv,Tm}) where {Tzv, Tm} = Tableau(t.phases, t.nqubits, collect(t.xzs')') +fastcolumn(t::Tableau{Tzv,Tm}) where {Tzv, Tm<:Adjoint} = t + +fastrow(s::Stabilizer) = Stabilizer(fastrow(s.tab)) +fastcolumn(s::Stabilizer) = Stabilizer(fastcolumn(s.tab)) + +fastrow(s::Destabilizer) = Destabilizer(fastrow(s.tab)) +fastcolumn(s::Destabilizer) = Destabilizer(fastcolumn(s.tab)) + +fastrow(s::MixedStabilizer) = MixedStabilizer(fastrow(s.tab), s.rank) +fastcolumn(s::MixedStabilizer) = MixedStabilizer(fastcolumn(s.tab), s.rank) + +fastrow(s::MixedDestabilizer) = MixedDestabilizer(fastrow(s.tab), s.rank) +fastcolumn(s::MixedDestabilizer) = MixedDestabilizer(fastcolumn(s.tab), s.rank) + +fastrow(s::PauliFrame) = PauliFrame(fastrow(s.frame), s.measurements) +fastcolumn(s::PauliFrame) = PauliFrame(fastcolumn(s.frame), s.measurements) diff --git a/src/mctrajectory.jl b/src/mctrajectory.jl index c6bf74404..c514ebdd8 100644 --- a/src/mctrajectory.jl +++ b/src/mctrajectory.jl @@ -60,6 +60,6 @@ end See also: [`pftrajectories`](@ref), `petrajectories`""" function mctrajectories(initialstate,circuit;trajectories=500) - counts = countmap([mctrajectory!(copy(initialstate),circuit)[2] for i in 1:trajectories]) # TODO use threads or at least a generator, but without breaking Polyester + counts = countmap([mctrajectory!(copy(initialstate),circuit)[2] for i in 1:trajectories]) # TODO use threads or at least a generator return counts end diff --git a/src/mul_leftright.jl b/src/mul_leftright.jl index f21ab427f..2d56ea741 100644 --- a/src/mul_leftright.jl +++ b/src/mul_leftright.jl @@ -1,8 +1,17 @@ +# using LoopVectorization +using HostCPUFeatures: pick_vector_width +import SIMD + """Nonvectorized version of `mul_left!` used for unit tests.""" function _mul_left_nonvec!(r::AbstractVector{T}, l::AbstractVector{T}; phases::Bool=true) where T<:Unsigned + rcnt1, rcnt2 = _mul_ordered_nonvec!(r,l; phases) + rcnt1 ⊻ (rcnt2 << 1) +end + +function _mul_ordered_nonvec!(r::AbstractVector{T}, l::AbstractVector{T}; phases::Bool=true) where T<:Unsigned if !phases r .⊻= l - return zero(T) + return (0, 0) end cnt1 = zero(T) cnt2 = zero(T) @@ -16,9 +25,41 @@ function _mul_left_nonvec!(r::AbstractVector{T}, l::AbstractVector{T}; phases::B cnt2 ⊻= (cnt1 ⊻ newx1 ⊻ newz1 ⊻ x1z2) & anti_comm cnt1 ⊻= anti_comm end - s = count_ones(cnt1) - s ⊻= count_ones(cnt2) << 1 - s + rcnt1 = count_ones(cnt1) + rcnt2 = count_ones(cnt2) + rcnt1, rcnt2 +end + +#= # LoopVectorization does not support the necessary bit operations +function mul_ordered_lv!(r::AbstractVector{T}, l::AbstractVector{T}; phases::Val{B}=Val(true)) where {T<:Unsigned, B} + if !B + r .⊻= l + return (0, 0) + end + cnt1 = zero(T) + cnt2 = zero(T) + len = length(l)÷2 + @turbo for i in 1:len + x1, x2, z1, z2 = l[i], r[i], l[i+len], r[i+len] + newx1 = x1 ⊻ x2 + r[i] = newx1 + newz1 = z1 ⊻ z2 + r[i+len] = newz1 + x1z2 = x1 & z2 + anti_comm = (x2 & z1) ⊻ x1z2 + cnt2 ⊻= (newx1 ⊻ newz1 ⊻ x1z2) & anti_comm + cnt1 ⊻= anti_comm + end + rcnt1 = count_ones(cnt1) + rcnt2 = count_ones(cnt2) + rcnt1, rcnt2 +end +=# + +function mul_ordered!(r::SubArray{T,1,P,I1,L1}, l::SubArray{T,1,P,I2,L2}; phases::Val{B}=Val(true)) where {T<:Unsigned, B, I1, I2, L1, L2, P<:Adjoint} + # This method exists because SIMD.jl does not play well with Adjoint + # Delete it and try `QuantumClifford.mul_left!(fastcolumn(random_stabilizer(194)), 2, 1)` # works fine for 192 + _mul_ordered_nonvec!(r,l; phases=B) end function mul_ordered!(r::AbstractVector{T}, l::AbstractVector{T}; phases::Val{B}=Val(true)) where {T<:Unsigned, B} diff --git a/src/pauli_frames.jl b/src/pauli_frames.jl index fa6c96f7d..e7bc17295 100644 --- a/src/pauli_frames.jl +++ b/src/pauli_frames.jl @@ -23,8 +23,9 @@ $(TYPEDSIGNATURES) Prepare an empty set of Pauli frames with the given number of `frames` and `qubits`. Preallocates spaces for `measurement` number of measurements. """ function PauliFrame(frames, qubits, measurements) - stab = zero(Stabilizer, frames, qubits) # TODO this should really be a Tableau - frame = PauliFrame(stab, zeros(Bool, frames, measurements)) + stab = fastcolumn(zero(Stabilizer, frames, qubits)) # TODO this should really be a Tableau + bits = zeros(Bool, frames, measurements) + frame = PauliFrame(stab, bits) initZ!(frame) return frame end @@ -40,7 +41,7 @@ It is done automatically by most [`PauliFrame`](@ref) constructors. function initZ!(frame::PauliFrame) T = eltype(frame.frame.tab.xzs) - @inbounds @simd for f in eachindex(frame) # TODO thread this + @inbounds @simd for f in eachindex(frame) @simd for row in 1:size(frame.frame.tab.xzs,1)÷2 frame.frame.tab.xzs[end÷2+row,f] = rand(T) end @@ -63,7 +64,7 @@ function apply!(frame::PauliFrame, op::sMZ) # TODO sMX, sMY ismall = _mod(T,i-1) ismallm = lowbit<<(ismall) - @inbounds @simd for f in eachindex(frame) # TODO thread this + @inbounds @simd for f in eachindex(frame) should_flip = !iszero(xzs[ibig,f] & ismallm) frame.measurements[f,op.bit] = should_flip end @@ -81,12 +82,12 @@ function apply!(frame::PauliFrame, op::sMRZ) # TODO sMRX, sMRY ismallm = lowbit<<(ismall) if op.bit != 0 - @inbounds @simd for f in eachindex(frame) # TODO thread this + @inbounds @simd for f in eachindex(frame) should_flip = !iszero(xzs[ibig,f] & ismallm) frame.measurements[f,op.bit] = should_flip end end - @inbounds @simd for f in eachindex(frame) # TODO thread this + @inbounds @simd for f in eachindex(frame) xzs[ibig,f] &= ~ismallm rand(Bool) && (xzs[end÷2+ibig,f] ⊻= ismallm) end @@ -103,7 +104,7 @@ function applynoise!(frame::PauliFrame,noise::UnbiasedUncorrelatedNoise,i::Int) ismall = _mod(T,i-1) ismallm = lowbit<<(ismall) - @inbounds @simd for f in eachindex(frame) # TODO thread this + @inbounds @simd for f in eachindex(frame) r = rand() if r < p # X error frame.frame.tab.xzs[ibig,f] ⊻= ismallm @@ -131,16 +132,9 @@ See the other methods for lower level access. Multithreading is enabled by default, but can be disabled by setting `threads=false`. Do not forget to launch Julia with multiple threads enabled, e.g. `julia -t4`, if you want to use multithreading. - -Note for advanced users: Much of the underlying QuantumClifford.jl functionaly is capable of -using Polyester.jl threads, but they are fully dissabled here as this is an embarassingly -parallel problem. If you want to use Polyester.jl threads, use the lower level methods. -The `threads` keyword argument controls whether standard Julia threads are used. """ function pftrajectories(circuit;trajectories=5000,threads=true) - Polyester.disable_polyester_threads() do - _pftrajectories(circuit;trajectories,threads) - end + _pftrajectories(circuit;trajectories,threads) end function _pftrajectories(circuit;trajectories=5000,threads=true) @@ -152,7 +146,7 @@ function _pftrajectories(circuit;trajectories=5000,threads=true) qmax=maximum((maximum(affectedqubits(g)) for g in ccircuit)) bmax=maximum((maximum(affectedbits(g),init=1) for g in ccircuit)) frames = PauliFrame(trajectories, qmax, bmax) - nthr = min(Threads.nthreads(),trajectories÷(MINBATCH1Q)) + nthr = min(Threads.nthreads(),trajectories÷(100)) if threads && nthr>1 batchsize = trajectories÷nthr Threads.@threads for i in 1:nthr diff --git a/src/symbolic_cliffords.jl b/src/symbolic_cliffords.jl index e5ee061ff..fabaa9aee 100644 --- a/src/symbolic_cliffords.jl +++ b/src/symbolic_cliffords.jl @@ -44,7 +44,7 @@ Base.@propagate_inbounds setzbit(s::TableauType{Tzv, Tme}, r::Int, c::Int, z::Tm function _apply!(stab::AbstractStabilizer, gate::G; phases::Val{B}=Val(true)) where {B, G<:AbstractSingleQubitOperator} s = tab(stab) c = gate.q - @batch per=core minbatch=MINBATCH1Q for r in eachindex(s) + @inbounds @simd for r in eachindex(s) x = getxbit(s, r, c) z = getzbit(s, r, c) x,z,phase = qubit_kernel(gate,x,z) @@ -137,7 +137,7 @@ function _apply!(stab::AbstractStabilizer, op::SingleQubitOperator; phases::Val{ sh = getshift(Tme, c) xx,zx,xz,zz = Tme.((op.xx,op.zx,op.xz,op.zz)) .<< sh anticom = ~iszero((~zz & xz & ~xx & zx) | ( zz & ~xz & xx & zx) | (zz & xz & xx & ~zx)) - @batch per=core minbatch=MINBATCH1Q for r in eachindex(s) + @inbounds @simd for r in eachindex(s) x = getxbit(s, r, c) z = getzbit(s, r, c) setxbit(s, r, c, (x&xx)⊻(z&zx)) @@ -220,7 +220,8 @@ function _apply!(stab::AbstractStabilizer, gate::G; phases::Val{B}=Val(true)) wh q2 = gate.q2 Tme = eltype(s.xzs) shift = getshift(Tme, q1) - getshift(Tme, q2) - @batch per=core minbatch=MINBATCH2Q for r in eachindex(s) + @inbounds @simd for r in eachindex(s) +# for r in eachindex(s) x1 = getxbit(s, r, q1) z1 = getzbit(s, r, q1) x2 = getxbit(s, r, q2)<