Memory layout optimizations and removing Polyester (#143)

QuantumSavory · Jul 5, 2023 · 14ee2c9 · 14ee2c9 · Krastanov · Jul 5, 2023
1 parent 0fcd5e9
commit 14ee2c9
Show file tree

Hide file tree

Showing 14 changed files with 134 additions and 41 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,11 @@
 
 # News
 
+## v0.8.10 - 2023-07-05
+
+- Remove Polyester.jl multithreading, leading to simpler and better compiled single-thread code. Now single-thread performance is much higher. Multithreading was generally not useful at the inner loops where it was deployed.
+- Implement `fastcolumn` and `fastrow`, which transform a tableau into a memory layout optimized for column or row operations.
+
 ## v0.8.9 - 2023-07-04
 
 - In the unexported experimental ECC module:

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "QuantumClifford"
 uuid = "0525e862-1e90-11e9-3e4d-1b39d7109de1"
 authors = ["Stefan Krastanov <stefan@krastanov.org>"]
-version = "0.8.9"
+version = "0.8.10"
 
 [deps]
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
@@ -13,7 +13,6 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 Nemo = "2edaba10-b0f1-5616-af89-8c11ac63239a"
-Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 QuantumInterface = "5717a53b-5d69-4fa3-b976-0bf2f97ca1e5"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -40,7 +39,6 @@ MacroTools = "0.5.9"
 Makie = "0.19"
 Nemo = "0.34"
 Plots = "1.38.0"
-Polyester = "0.7"
 PrecompileTools = "1"
 Quantikz = "1.2"
 QuantumInterface = "0.3.0"

diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
@@ -133,6 +133,7 @@ SUITE["circuitsim"]["pftrajectories_sumtype"]["q101_r1"]      = @benchmarkable p
 SUITE["circuitsim"]["pftrajectories_sumtype"]["q1001_r1"]     = @benchmarkable pftrajectories(state,circuit) setup=(state=PauliFrame(   1, 1001, 1); circuit=compactify_circuit(x_diag_circuit(1000))) evals=1
 SUITE["circuitsim"]["pftrajectories_sumtype"]["q1001_r100"]   = @benchmarkable pftrajectories(state,circuit) setup=(state=PauliFrame( 100, 1001, 1); circuit=compactify_circuit(x_diag_circuit(1000))) evals=1
 SUITE["circuitsim"]["pftrajectories_sumtype"]["q1001_r10000"] = @benchmarkable pftrajectories(state,circuit) setup=(state=PauliFrame(1000, 1001, 1); circuit=compactify_circuit(x_diag_circuit(1000))) evals=1
+SUITE["circuitsim"]["pftrajectories_sumtype"]["q1001_r10000_fastrow"] = @benchmarkable pftrajectories(state,circuit) setup=(state=fastrow(PauliFrame(1000, 1001, 1)); circuit=compactify_circuit(x_diag_circuit(1000))) evals=1
 SUITE["circuitsim"]["mctrajectories_sumtype"] = BenchmarkGroup(["mctrajectories_sumtype"])
 SUITE["circuitsim"]["mctrajectories_sumtype"]["q101_r1"]    = @benchmarkable mctrajectory!(state, circuit) setup=(state=Register(one(Stabilizer,  101), [false]); circuit=compactify_circuit(x_diag_circuit( 100))) evals=1
 SUITE["circuitsim"]["mctrajectories_sumtype"]["q1001_r1"]   = @benchmarkable mctrajectory!(state, circuit) setup=(state=Register(one(Stabilizer, 1001), [false]); circuit=compactify_circuit(x_diag_circuit(1000))) evals=1

diff --git a/src/QuantumClifford.jl b/src/QuantumClifford.jl
@@ -8,12 +8,8 @@ module QuantumClifford
 # TODO Significant performance improvements: many operations do not need phase=true if the Pauli operations commute
 
 import LinearAlgebra
-using LinearAlgebra: inv, mul!, rank
+using LinearAlgebra: inv, mul!, rank, Adjoint
 using DocStringExtensions
-using Polyester
-#using LoopVectorization
-using HostCPUFeatures: pick_vector_width
-import SIMD
 
 import QuantumInterface: tensor, ⊗, tensor_pow, apply!, nqubits, expect, project!, reset_qubits!, traceout!, ptrace, apply!, projectX!, projectY!, projectZ!, entanglement_entropy
 
@@ -25,6 +21,7 @@ export
     prodphase, comm,
     nqubits,
     stabilizerview, destabilizerview, logicalxview, logicalzview, phases,
+    fastcolumn, fastrow,
     bitview, quantumstate, tab,
     BadDataStructure,
     affectedqubits, #TODO move to QuantumInterface?
@@ -91,10 +88,6 @@ function __init__()
     BIG_INT_FOUR[] = BigInt(4)
 end
 
-const MINBATCH1Q = 100
-const MINBATCH2Q = 100
-const MINBATCHDENSE = 25
-
 const NoZeroQubit = ArgumentError("Qubit indices have to be larger than zero, but you attempting are creating a gate acting on a qubit with a non-positive index. Ensure indexing always starts from 1.")
 
 # Predefined constants representing the permitted phases encoded
@@ -1276,6 +1269,7 @@ include("experimental/Experimental.jl")
 include("graphs.jl")
 include("entanglement.jl")
 include("tableau_show.jl")
+include("fastmemlayout.jl")
 include("sumtypes.jl")
 include("precompiles.jl")
 include("ecc/ECC.jl")

diff --git a/src/dense_cliffords.jl b/src/dense_cliffords.jl
@@ -127,7 +127,8 @@ function _apply!(stab::AbstractStabilizer, c::CliffordOperator; phases::Val{B}=V
     nqubits(stab)==nqubits(c) || throw(DimensionMismatch("The tableau and the Clifford operator need to act on the same number of qubits. Consider specifying an array of indices as a third argument to the `apply!` function to avoid this error."))
     s_tab = tab(stab)
     c_tab = tab(c)
-    @batch minbatch=MINBATCHDENSE threadlocal=zero(c_tab[1]) for row_stab in eachindex(s_tab)
+    threadlocal=zero(c_tab[1])
+    @inbounds @simd for row_stab in eachindex(s_tab)
         zero!(threadlocal) # a new stabrow for temporary storage
         apply_row_kernel!(threadlocal, row_stab, s_tab, c_tab, phases=phases)
     end
@@ -172,7 +173,8 @@ function _apply!(stab::AbstractStabilizer, c::CliffordOperator, indices_of_appli
     #max(indices_of_application)<=nqubits(s) || throw(DimensionMismatch("")) # Too expensive to check every time
     s_tab = tab(stab)
     c_tab = tab(c)
-    @batch minbatch=25 threadlocal=zero(c_tab[1]) for row_stab in eachindex(s_tab)
+    threadlocal=zero(c_tab[1])
+    @inbounds @simd for row_stab in eachindex(s_tab)
         zero!(threadlocal) # a new stabrow for temporary storage
         apply_row_kernel!(threadlocal, row_stab, s_tab, c_tab, indices_of_application, phases=phases)
     end

diff --git a/src/fastmemlayout.jl b/src/fastmemlayout.jl
@@ -0,0 +1,36 @@
+"""Convert a tableau to a memory layout that is fast for row operations.
+
+In this layout a Pauli string (a row of the tableau) is stored contiguously in memory.
+
+See also: [`fastrow`](@ref)"""
+function fastrow end
+
+"""Convert a tableau to a memory layout that is fast for column operations.
+
+In this layout a column of the tableau is stored (mostly) contiguously in memory.
+Due to bitpacking, e.g., packing 64 bits into a single `UInt64`,
+the memory layout is not perfectly contiguous,
+but it is still optimal given that some bitwrangling is required to extract a given bit.
+
+See also: [`fastrow`](@ref)"""
+function fastcolumn end
+
+fastrow(t::Tableau{Tzv,Tm}) where {Tzv, Tm} = t
+fastrow(t::Tableau{Tzv,Tm}) where {Tzv, Tm<:Adjoint} = Tableau(t.phases, t.nqubits, collect(t.xzs))
+fastcolumn(t::Tableau{Tzv,Tm}) where {Tzv, Tm} = Tableau(t.phases, t.nqubits, collect(t.xzs')')
+fastcolumn(t::Tableau{Tzv,Tm}) where {Tzv, Tm<:Adjoint} = t
+
+fastrow(s::Stabilizer) = Stabilizer(fastrow(s.tab))
+fastcolumn(s::Stabilizer) = Stabilizer(fastcolumn(s.tab))
+
+fastrow(s::Destabilizer) = Destabilizer(fastrow(s.tab))
+fastcolumn(s::Destabilizer) = Destabilizer(fastcolumn(s.tab))
+
+fastrow(s::MixedStabilizer) = MixedStabilizer(fastrow(s.tab), s.rank)
+fastcolumn(s::MixedStabilizer) = MixedStabilizer(fastcolumn(s.tab), s.rank)
+
+fastrow(s::MixedDestabilizer) = MixedDestabilizer(fastrow(s.tab), s.rank)
+fastcolumn(s::MixedDestabilizer) = MixedDestabilizer(fastcolumn(s.tab), s.rank)
+
+fastrow(s::PauliFrame) = PauliFrame(fastrow(s.frame), s.measurements)
+fastcolumn(s::PauliFrame) = PauliFrame(fastcolumn(s.frame), s.measurements)
diff --git a/src/mctrajectory.jl b/src/mctrajectory.jl
@@ -60,6 +60,6 @@ end
 
 See also: [`pftrajectories`](@ref), `petrajectories`"""
 function mctrajectories(initialstate,circuit;trajectories=500)
-    counts = countmap([mctrajectory!(copy(initialstate),circuit)[2] for i in 1:trajectories]) # TODO use threads or at least a generator, but without breaking Polyester
+    counts = countmap([mctrajectory!(copy(initialstate),circuit)[2] for i in 1:trajectories]) # TODO use threads or at least a generator
     return counts
 end
diff --git a/src/mul_leftright.jl b/src/mul_leftright.jl
@@ -1,8 +1,17 @@
+# using LoopVectorization
+using HostCPUFeatures: pick_vector_width
+import SIMD
+
 """Nonvectorized version of `mul_left!` used for unit tests."""
 function _mul_left_nonvec!(r::AbstractVector{T}, l::AbstractVector{T}; phases::Bool=true) where T<:Unsigned
+    rcnt1, rcnt2 = _mul_ordered_nonvec!(r,l; phases)
+    rcnt1 ⊻ (rcnt2 << 1)
+end
+
+function _mul_ordered_nonvec!(r::AbstractVector{T}, l::AbstractVector{T}; phases::Bool=true) where T<:Unsigned
     if !phases
         r .⊻= l
-        return zero(T)
+        return (0, 0)
     end
     cnt1 = zero(T)
     cnt2 = zero(T)
@@ -16,9 +25,41 @@ function _mul_left_nonvec!(r::AbstractVector{T}, l::AbstractVector{T}; phases::B
         cnt2 ⊻= (cnt1 ⊻ newx1 ⊻ newz1 ⊻ x1z2) & anti_comm
         cnt1 ⊻= anti_comm
     end
-    s = count_ones(cnt1)
-    s ⊻= count_ones(cnt2) << 1
-    s
+    rcnt1 = count_ones(cnt1)
+    rcnt2 = count_ones(cnt2)
+    rcnt1, rcnt2
+end
+
+#= # LoopVectorization does not support the necessary bit operations
+function mul_ordered_lv!(r::AbstractVector{T}, l::AbstractVector{T}; phases::Val{B}=Val(true)) where {T<:Unsigned, B}
+    if !B
+        r .⊻= l
+        return (0, 0)
+    end
+    cnt1 = zero(T)
+    cnt2 = zero(T)
+    len = length(l)÷2
+    @turbo for i in 1:len
+        x1, x2, z1, z2 = l[i], r[i], l[i+len], r[i+len]
+        newx1 = x1 ⊻ x2
+        r[i] = newx1
+        newz1 = z1 ⊻ z2
+        r[i+len] = newz1
+        x1z2 = x1 & z2
+        anti_comm = (x2 & z1) ⊻ x1z2
+        cnt2 ⊻= (newx1 ⊻ newz1 ⊻ x1z2) & anti_comm
+        cnt1 ⊻= anti_comm
+    end
+    rcnt1 = count_ones(cnt1)
+    rcnt2 = count_ones(cnt2)
+    rcnt1, rcnt2
+end
+=#
+
+function mul_ordered!(r::SubArray{T,1,P,I1,L1}, l::SubArray{T,1,P,I2,L2}; phases::Val{B}=Val(true)) where {T<:Unsigned, B, I1, I2, L1, L2, P<:Adjoint}
+    # This method exists because SIMD.jl does not play well with Adjoint
+    # Delete it and try `QuantumClifford.mul_left!(fastcolumn(random_stabilizer(194)), 2, 1)` # works fine for 192
+    _mul_ordered_nonvec!(r,l; phases=B)
 end
 
 function mul_ordered!(r::AbstractVector{T}, l::AbstractVector{T}; phases::Val{B}=Val(true)) where {T<:Unsigned, B}

diff --git a/src/pauli_frames.jl b/src/pauli_frames.jl
@@ -23,8 +23,9 @@ $(TYPEDSIGNATURES)
 Prepare an empty set of Pauli frames with the given number of `frames` and `qubits`. Preallocates spaces for `measurement` number of measurements.
 """
 function PauliFrame(frames, qubits, measurements)
-    stab = zero(Stabilizer, frames, qubits) # TODO this should really be a Tableau
-    frame = PauliFrame(stab, zeros(Bool, frames, measurements))
+    stab = fastcolumn(zero(Stabilizer, frames, qubits)) # TODO this should really be a Tableau
+    bits = zeros(Bool, frames, measurements)
+    frame = PauliFrame(stab, bits)
     initZ!(frame)
     return frame
 end
@@ -40,7 +41,7 @@ It is done automatically by most [`PauliFrame`](@ref) constructors.
 function initZ!(frame::PauliFrame)
     T = eltype(frame.frame.tab.xzs)
 
-    @inbounds @simd for f in eachindex(frame) # TODO thread this
+    @inbounds @simd for f in eachindex(frame)
         @simd for row in 1:size(frame.frame.tab.xzs,1)÷2
             frame.frame.tab.xzs[end÷2+row,f] = rand(T)
         end
@@ -63,7 +64,7 @@ function apply!(frame::PauliFrame, op::sMZ) # TODO sMX, sMY
     ismall = _mod(T,i-1)
     ismallm = lowbit<<(ismall)
 
-    @inbounds @simd for f in eachindex(frame) # TODO thread this
+    @inbounds @simd for f in eachindex(frame)
         should_flip = !iszero(xzs[ibig,f] & ismallm)
         frame.measurements[f,op.bit] = should_flip
     end
@@ -81,12 +82,12 @@ function apply!(frame::PauliFrame, op::sMRZ) # TODO sMRX, sMRY
     ismallm = lowbit<<(ismall)
 
     if op.bit != 0
-        @inbounds @simd for f in eachindex(frame) # TODO thread this
+        @inbounds @simd for f in eachindex(frame)
             should_flip = !iszero(xzs[ibig,f] & ismallm)
             frame.measurements[f,op.bit] = should_flip
         end
     end
-    @inbounds @simd for f in eachindex(frame) # TODO thread this
+    @inbounds @simd for f in eachindex(frame)
         xzs[ibig,f] &= ~ismallm
         rand(Bool) && (xzs[end÷2+ibig,f] ⊻= ismallm)
     end
@@ -103,7 +104,7 @@ function applynoise!(frame::PauliFrame,noise::UnbiasedUncorrelatedNoise,i::Int)
     ismall = _mod(T,i-1)
     ismallm = lowbit<<(ismall)
 
-    @inbounds @simd for f in eachindex(frame) # TODO thread this
+    @inbounds @simd for f in eachindex(frame)
         r = rand()
         if  r < p # X error
             frame.frame.tab.xzs[ibig,f] ⊻= ismallm
@@ -131,16 +132,9 @@ See the other methods for lower level access.
 Multithreading is enabled by default, but can be disabled by setting `threads=false`.
 Do not forget to launch Julia with multiple threads enabled, e.g. `julia -t4`, if you want
 to use multithreading.
-
-Note for advanced users: Much of the underlying QuantumClifford.jl functionaly is capable of
-using Polyester.jl threads, but they are fully dissabled here as this is an embarassingly
-parallel problem. If you want to use Polyester.jl threads, use the lower level methods.
-The `threads` keyword argument controls whether standard Julia threads are used.
 """
 function pftrajectories(circuit;trajectories=5000,threads=true)
-    Polyester.disable_polyester_threads() do
-        _pftrajectories(circuit;trajectories,threads)
-    end
+    _pftrajectories(circuit;trajectories,threads)
 end
 
 function _pftrajectories(circuit;trajectories=5000,threads=true)
@@ -152,7 +146,7 @@ function _pftrajectories(circuit;trajectories=5000,threads=true)
     qmax=maximum((maximum(affectedqubits(g)) for g in ccircuit))
     bmax=maximum((maximum(affectedbits(g),init=1) for g in ccircuit))
     frames = PauliFrame(trajectories, qmax, bmax)
-    nthr = min(Threads.nthreads(),trajectories÷(MINBATCH1Q))
+    nthr = min(Threads.nthreads(),trajectories÷(100))
     if threads && nthr>1
         batchsize = trajectories÷nthr
         Threads.@threads for i in 1:nthr

diff --git a/src/symbolic_cliffords.jl b/src/symbolic_cliffords.jl
@@ -44,7 +44,7 @@ Base.@propagate_inbounds setzbit(s::TableauType{Tzv, Tme}, r::Int, c::Int, z::Tm
 function _apply!(stab::AbstractStabilizer, gate::G; phases::Val{B}=Val(true)) where {B, G<:AbstractSingleQubitOperator}
     s = tab(stab)
     c = gate.q
-    @batch per=core minbatch=MINBATCH1Q for r in eachindex(s)
+    @inbounds @simd for r in eachindex(s)
         x = getxbit(s, r, c)
         z = getzbit(s, r, c)
         x,z,phase = qubit_kernel(gate,x,z)
@@ -137,7 +137,7 @@ function _apply!(stab::AbstractStabilizer, op::SingleQubitOperator; phases::Val{
     sh = getshift(Tme, c)
     xx,zx,xz,zz = Tme.((op.xx,op.zx,op.xz,op.zz)) .<< sh
     anticom = ~iszero((~zz & xz & ~xx & zx) | ( zz & ~xz & xx & zx) | (zz &  xz & xx & ~zx))
-    @batch per=core minbatch=MINBATCH1Q for r in eachindex(s)
+    @inbounds @simd for r in eachindex(s)
         x = getxbit(s, r, c)
         z = getzbit(s, r, c)
         setxbit(s, r, c, (x&xx)⊻(z&zx))
@@ -220,7 +220,8 @@ function _apply!(stab::AbstractStabilizer, gate::G; phases::Val{B}=Val(true)) wh
     q2 = gate.q2
     Tme = eltype(s.xzs)
     shift = getshift(Tme, q1) - getshift(Tme, q2)
-    @batch per=core minbatch=MINBATCH2Q for r in eachindex(s)
+    @inbounds @simd for r in eachindex(s)
+#    for r in eachindex(s)
         x1 = getxbit(s, r, q1)
         z1 = getzbit(s, r, q1)
         x2 = getxbit(s, r, q2)<<shift

diff --git a/test/Project.toml b/test/Project.toml
@@ -13,7 +13,6 @@ JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 Nemo = "2edaba10-b0f1-5616-af89-8c11ac63239a"
-Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
 Quantikz = "b0d11df0-eea3-4d79-b4a5-421488cbf74b"
 QuantumInterface = "5717a53b-5d69-4fa3-b976-0bf2f97ca1e5"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -42,6 +42,7 @@ println("Starting tests with $(Threads.nthreads()) threads out of `Sys.CPU_THREA
 @doset "noisycircuits"
 @doset "syndromemeas"
 @doset "bitpack"
+@doset "memorylayout"
 @doset "graphs"
 @doset "hash"
 @doset "entanglement"

diff --git a/test/test_jet.jl b/test/test_jet.jl
@@ -4,7 +4,6 @@ using ArrayInterface
 using Static
 using Graphs
 using LinearAlgebra
-using Polyester
 
 using JET: ReportPass, BasicPass, InferenceErrorReport, UncaughtExceptionReport
 
@@ -31,7 +30,6 @@ end
             AnyFrameModule(ArrayInterface),
             AnyFrameModule(Static),
             AnyFrameModule(LinearAlgebra),
-            AnyFrameModule(Polyester)
         )
     )
     @show rep

diff --git a/test/test_memorylayout.jl b/test/test_memorylayout.jl
@@ -0,0 +1,23 @@
+using Random
+using QuantumClifford
+using Test
+
+@testset "Column-fast vs row-fast operations" begin
+    for n in [3, 300]
+        for T in (Stabilizer, Destabilizer, MixedStabilizer, MixedDestabilizer)
+            s = T(random_stabilizer(n,n))
+            @test fastrow(copy(s)) == fastrow(fastcolumn(copy(s))) # TODO should not need to convert to the same layout for comparisons
+            @test canonicalize!(fastrow(copy(s))) == fastrow(canonicalize!(fastcolumn(copy(s)))) # TODO should not need to convert to the same layout for comparisons
+            c = random_clifford(n)
+            layouts = []
+            for layout in (fastrow, fastcolumn)
+                ss = layout(copy(s))
+                @test typeof(ss) == typeof(deepcopy(ss))
+                apply!(ss, c)
+                apply!(ss, sCNOT(1,n-1))
+                push!(layouts, ss)
+            end
+            @test fastrow(copy(layouts[1])) == fastrow(copy(layouts[2]))
+        end
+    end
+end