Skip to content

Commit

Permalink
Memory layout optimizations and removing Polyester (#143)
Browse files Browse the repository at this point in the history
  • Loading branch information
Krastanov authored Jul 5, 2023
1 parent 0fcd5e9 commit 14ee2c9
Show file tree
Hide file tree
Showing 14 changed files with 134 additions and 41 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@

# News

## v0.8.10 - 2023-07-05

- Remove Polyester.jl multithreading, leading to simpler and better compiled single-thread code. Now single-thread performance is much higher. Multithreading was generally not useful at the inner loops where it was deployed.
- Implement `fastcolumn` and `fastrow`, which transform a tableau into a memory layout optimized for column or row operations.

## v0.8.9 - 2023-07-04

- In the unexported experimental ECC module:
Expand Down
4 changes: 1 addition & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "QuantumClifford"
uuid = "0525e862-1e90-11e9-3e4d-1b39d7109de1"
authors = ["Stefan Krastanov <stefan@krastanov.org>"]
version = "0.8.9"
version = "0.8.10"

[deps]
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
Expand All @@ -13,7 +13,6 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
Nemo = "2edaba10-b0f1-5616-af89-8c11ac63239a"
Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
QuantumInterface = "5717a53b-5d69-4fa3-b976-0bf2f97ca1e5"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand All @@ -40,7 +39,6 @@ MacroTools = "0.5.9"
Makie = "0.19"
Nemo = "0.34"
Plots = "1.38.0"
Polyester = "0.7"
PrecompileTools = "1"
Quantikz = "1.2"
QuantumInterface = "0.3.0"
Expand Down
1 change: 1 addition & 0 deletions benchmark/benchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ SUITE["circuitsim"]["pftrajectories_sumtype"]["q101_r1"] = @benchmarkable p
SUITE["circuitsim"]["pftrajectories_sumtype"]["q1001_r1"] = @benchmarkable pftrajectories(state,circuit) setup=(state=PauliFrame( 1, 1001, 1); circuit=compactify_circuit(x_diag_circuit(1000))) evals=1
SUITE["circuitsim"]["pftrajectories_sumtype"]["q1001_r100"] = @benchmarkable pftrajectories(state,circuit) setup=(state=PauliFrame( 100, 1001, 1); circuit=compactify_circuit(x_diag_circuit(1000))) evals=1
SUITE["circuitsim"]["pftrajectories_sumtype"]["q1001_r10000"] = @benchmarkable pftrajectories(state,circuit) setup=(state=PauliFrame(1000, 1001, 1); circuit=compactify_circuit(x_diag_circuit(1000))) evals=1
SUITE["circuitsim"]["pftrajectories_sumtype"]["q1001_r10000_fastrow"] = @benchmarkable pftrajectories(state,circuit) setup=(state=fastrow(PauliFrame(1000, 1001, 1)); circuit=compactify_circuit(x_diag_circuit(1000))) evals=1
SUITE["circuitsim"]["mctrajectories_sumtype"] = BenchmarkGroup(["mctrajectories_sumtype"])
SUITE["circuitsim"]["mctrajectories_sumtype"]["q101_r1"] = @benchmarkable mctrajectory!(state, circuit) setup=(state=Register(one(Stabilizer, 101), [false]); circuit=compactify_circuit(x_diag_circuit( 100))) evals=1
SUITE["circuitsim"]["mctrajectories_sumtype"]["q1001_r1"] = @benchmarkable mctrajectory!(state, circuit) setup=(state=Register(one(Stabilizer, 1001), [false]); circuit=compactify_circuit(x_diag_circuit(1000))) evals=1
Expand Down
12 changes: 3 additions & 9 deletions src/QuantumClifford.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,8 @@ module QuantumClifford
# TODO Significant performance improvements: many operations do not need phase=true if the Pauli operations commute

import LinearAlgebra
using LinearAlgebra: inv, mul!, rank
using LinearAlgebra: inv, mul!, rank, Adjoint
using DocStringExtensions
using Polyester
#using LoopVectorization
using HostCPUFeatures: pick_vector_width
import SIMD

import QuantumInterface: tensor, , tensor_pow, apply!, nqubits, expect, project!, reset_qubits!, traceout!, ptrace, apply!, projectX!, projectY!, projectZ!, entanglement_entropy

Expand All @@ -25,6 +21,7 @@ export
prodphase, comm,
nqubits,
stabilizerview, destabilizerview, logicalxview, logicalzview, phases,
fastcolumn, fastrow,
bitview, quantumstate, tab,
BadDataStructure,
affectedqubits, #TODO move to QuantumInterface?
Expand Down Expand Up @@ -91,10 +88,6 @@ function __init__()
BIG_INT_FOUR[] = BigInt(4)
end

const MINBATCH1Q = 100
const MINBATCH2Q = 100
const MINBATCHDENSE = 25

const NoZeroQubit = ArgumentError("Qubit indices have to be larger than zero, but you attempting are creating a gate acting on a qubit with a non-positive index. Ensure indexing always starts from 1.")

# Predefined constants representing the permitted phases encoded
Expand Down Expand Up @@ -1276,6 +1269,7 @@ include("experimental/Experimental.jl")
include("graphs.jl")
include("entanglement.jl")
include("tableau_show.jl")
include("fastmemlayout.jl")
include("sumtypes.jl")
include("precompiles.jl")
include("ecc/ECC.jl")
Expand Down
6 changes: 4 additions & 2 deletions src/dense_cliffords.jl
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ function _apply!(stab::AbstractStabilizer, c::CliffordOperator; phases::Val{B}=V
nqubits(stab)==nqubits(c) || throw(DimensionMismatch("The tableau and the Clifford operator need to act on the same number of qubits. Consider specifying an array of indices as a third argument to the `apply!` function to avoid this error."))
s_tab = tab(stab)
c_tab = tab(c)
@batch minbatch=MINBATCHDENSE threadlocal=zero(c_tab[1]) for row_stab in eachindex(s_tab)
threadlocal=zero(c_tab[1])
@inbounds @simd for row_stab in eachindex(s_tab)
zero!(threadlocal) # a new stabrow for temporary storage
apply_row_kernel!(threadlocal, row_stab, s_tab, c_tab, phases=phases)
end
Expand Down Expand Up @@ -172,7 +173,8 @@ function _apply!(stab::AbstractStabilizer, c::CliffordOperator, indices_of_appli
#max(indices_of_application)<=nqubits(s) || throw(DimensionMismatch("")) # Too expensive to check every time
s_tab = tab(stab)
c_tab = tab(c)
@batch minbatch=25 threadlocal=zero(c_tab[1]) for row_stab in eachindex(s_tab)
threadlocal=zero(c_tab[1])
@inbounds @simd for row_stab in eachindex(s_tab)
zero!(threadlocal) # a new stabrow for temporary storage
apply_row_kernel!(threadlocal, row_stab, s_tab, c_tab, indices_of_application, phases=phases)
end
Expand Down
36 changes: 36 additions & 0 deletions src/fastmemlayout.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Convert a tableau to a memory layout that is fast for row operations.
In this layout a Pauli string (a row of the tableau) is stored contiguously in memory.
See also: [`fastrow`](@ref)"""
function fastrow end

"""Convert a tableau to a memory layout that is fast for column operations.
In this layout a column of the tableau is stored (mostly) contiguously in memory.
Due to bitpacking, e.g., packing 64 bits into a single `UInt64`,
the memory layout is not perfectly contiguous,
but it is still optimal given that some bitwrangling is required to extract a given bit.
See also: [`fastrow`](@ref)"""
function fastcolumn end

fastrow(t::Tableau{Tzv,Tm}) where {Tzv, Tm} = t
fastrow(t::Tableau{Tzv,Tm}) where {Tzv, Tm<:Adjoint} = Tableau(t.phases, t.nqubits, collect(t.xzs))
fastcolumn(t::Tableau{Tzv,Tm}) where {Tzv, Tm} = Tableau(t.phases, t.nqubits, collect(t.xzs')')
fastcolumn(t::Tableau{Tzv,Tm}) where {Tzv, Tm<:Adjoint} = t

fastrow(s::Stabilizer) = Stabilizer(fastrow(s.tab))
fastcolumn(s::Stabilizer) = Stabilizer(fastcolumn(s.tab))

fastrow(s::Destabilizer) = Destabilizer(fastrow(s.tab))
fastcolumn(s::Destabilizer) = Destabilizer(fastcolumn(s.tab))

fastrow(s::MixedStabilizer) = MixedStabilizer(fastrow(s.tab), s.rank)
fastcolumn(s::MixedStabilizer) = MixedStabilizer(fastcolumn(s.tab), s.rank)

fastrow(s::MixedDestabilizer) = MixedDestabilizer(fastrow(s.tab), s.rank)
fastcolumn(s::MixedDestabilizer) = MixedDestabilizer(fastcolumn(s.tab), s.rank)

fastrow(s::PauliFrame) = PauliFrame(fastrow(s.frame), s.measurements)
fastcolumn(s::PauliFrame) = PauliFrame(fastcolumn(s.frame), s.measurements)
2 changes: 1 addition & 1 deletion src/mctrajectory.jl
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,6 @@ end
See also: [`pftrajectories`](@ref), `petrajectories`"""
function mctrajectories(initialstate,circuit;trajectories=500)
counts = countmap([mctrajectory!(copy(initialstate),circuit)[2] for i in 1:trajectories]) # TODO use threads or at least a generator, but without breaking Polyester
counts = countmap([mctrajectory!(copy(initialstate),circuit)[2] for i in 1:trajectories]) # TODO use threads or at least a generator
return counts
end
49 changes: 45 additions & 4 deletions src/mul_leftright.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
# using LoopVectorization
using HostCPUFeatures: pick_vector_width
import SIMD

"""Nonvectorized version of `mul_left!` used for unit tests."""
function _mul_left_nonvec!(r::AbstractVector{T}, l::AbstractVector{T}; phases::Bool=true) where T<:Unsigned
rcnt1, rcnt2 = _mul_ordered_nonvec!(r,l; phases)
rcnt1 (rcnt2 << 1)
end

function _mul_ordered_nonvec!(r::AbstractVector{T}, l::AbstractVector{T}; phases::Bool=true) where T<:Unsigned
if !phases
r .⊻= l
return zero(T)
return (0, 0)
end
cnt1 = zero(T)
cnt2 = zero(T)
Expand All @@ -16,9 +25,41 @@ function _mul_left_nonvec!(r::AbstractVector{T}, l::AbstractVector{T}; phases::B
cnt2 ⊻= (cnt1 newx1 newz1 x1z2) & anti_comm
cnt1 ⊻= anti_comm
end
s = count_ones(cnt1)
s ⊻= count_ones(cnt2) << 1
s
rcnt1 = count_ones(cnt1)
rcnt2 = count_ones(cnt2)
rcnt1, rcnt2
end

#= # LoopVectorization does not support the necessary bit operations
function mul_ordered_lv!(r::AbstractVector{T}, l::AbstractVector{T}; phases::Val{B}=Val(true)) where {T<:Unsigned, B}
if !B
r .⊻= l
return (0, 0)
end
cnt1 = zero(T)
cnt2 = zero(T)
len = length(l)÷2
@turbo for i in 1:len
x1, x2, z1, z2 = l[i], r[i], l[i+len], r[i+len]
newx1 = x1 ⊻ x2
r[i] = newx1
newz1 = z1 ⊻ z2
r[i+len] = newz1
x1z2 = x1 & z2
anti_comm = (x2 & z1) ⊻ x1z2
cnt2 ⊻= (newx1 ⊻ newz1 ⊻ x1z2) & anti_comm
cnt1 ⊻= anti_comm
end
rcnt1 = count_ones(cnt1)
rcnt2 = count_ones(cnt2)
rcnt1, rcnt2
end
=#

function mul_ordered!(r::SubArray{T,1,P,I1,L1}, l::SubArray{T,1,P,I2,L2}; phases::Val{B}=Val(true)) where {T<:Unsigned, B, I1, I2, L1, L2, P<:Adjoint}
# This method exists because SIMD.jl does not play well with Adjoint
# Delete it and try `QuantumClifford.mul_left!(fastcolumn(random_stabilizer(194)), 2, 1)` # works fine for 192
_mul_ordered_nonvec!(r,l; phases=B)
end

function mul_ordered!(r::AbstractVector{T}, l::AbstractVector{T}; phases::Val{B}=Val(true)) where {T<:Unsigned, B}
Expand Down
26 changes: 10 additions & 16 deletions src/pauli_frames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ $(TYPEDSIGNATURES)
Prepare an empty set of Pauli frames with the given number of `frames` and `qubits`. Preallocates spaces for `measurement` number of measurements.
"""
function PauliFrame(frames, qubits, measurements)
stab = zero(Stabilizer, frames, qubits) # TODO this should really be a Tableau
frame = PauliFrame(stab, zeros(Bool, frames, measurements))
stab = fastcolumn(zero(Stabilizer, frames, qubits)) # TODO this should really be a Tableau
bits = zeros(Bool, frames, measurements)
frame = PauliFrame(stab, bits)
initZ!(frame)
return frame
end
Expand All @@ -40,7 +41,7 @@ It is done automatically by most [`PauliFrame`](@ref) constructors.
function initZ!(frame::PauliFrame)
T = eltype(frame.frame.tab.xzs)

@inbounds @simd for f in eachindex(frame) # TODO thread this
@inbounds @simd for f in eachindex(frame)
@simd for row in 1:size(frame.frame.tab.xzs,1)÷2
frame.frame.tab.xzs[end÷2+row,f] = rand(T)
end
Expand All @@ -63,7 +64,7 @@ function apply!(frame::PauliFrame, op::sMZ) # TODO sMX, sMY
ismall = _mod(T,i-1)
ismallm = lowbit<<(ismall)

@inbounds @simd for f in eachindex(frame) # TODO thread this
@inbounds @simd for f in eachindex(frame)
should_flip = !iszero(xzs[ibig,f] & ismallm)
frame.measurements[f,op.bit] = should_flip
end
Expand All @@ -81,12 +82,12 @@ function apply!(frame::PauliFrame, op::sMRZ) # TODO sMRX, sMRY
ismallm = lowbit<<(ismall)

if op.bit != 0
@inbounds @simd for f in eachindex(frame) # TODO thread this
@inbounds @simd for f in eachindex(frame)
should_flip = !iszero(xzs[ibig,f] & ismallm)
frame.measurements[f,op.bit] = should_flip
end
end
@inbounds @simd for f in eachindex(frame) # TODO thread this
@inbounds @simd for f in eachindex(frame)
xzs[ibig,f] &= ~ismallm
rand(Bool) && (xzs[end÷2+ibig,f] ⊻= ismallm)
end
Expand All @@ -103,7 +104,7 @@ function applynoise!(frame::PauliFrame,noise::UnbiasedUncorrelatedNoise,i::Int)
ismall = _mod(T,i-1)
ismallm = lowbit<<(ismall)

@inbounds @simd for f in eachindex(frame) # TODO thread this
@inbounds @simd for f in eachindex(frame)
r = rand()
if r < p # X error
frame.frame.tab.xzs[ibig,f] ⊻= ismallm
Expand Down Expand Up @@ -131,16 +132,9 @@ See the other methods for lower level access.
Multithreading is enabled by default, but can be disabled by setting `threads=false`.
Do not forget to launch Julia with multiple threads enabled, e.g. `julia -t4`, if you want
to use multithreading.
Note for advanced users: Much of the underlying QuantumClifford.jl functionaly is capable of
using Polyester.jl threads, but they are fully dissabled here as this is an embarassingly
parallel problem. If you want to use Polyester.jl threads, use the lower level methods.
The `threads` keyword argument controls whether standard Julia threads are used.
"""
function pftrajectories(circuit;trajectories=5000,threads=true)
Polyester.disable_polyester_threads() do
_pftrajectories(circuit;trajectories,threads)
end
_pftrajectories(circuit;trajectories,threads)
end

function _pftrajectories(circuit;trajectories=5000,threads=true)
Expand All @@ -152,7 +146,7 @@ function _pftrajectories(circuit;trajectories=5000,threads=true)
qmax=maximum((maximum(affectedqubits(g)) for g in ccircuit))
bmax=maximum((maximum(affectedbits(g),init=1) for g in ccircuit))
frames = PauliFrame(trajectories, qmax, bmax)
nthr = min(Threads.nthreads(),trajectories÷(MINBATCH1Q))
nthr = min(Threads.nthreads(),trajectories÷(100))
if threads && nthr>1
batchsize = trajectories÷nthr
Threads.@threads for i in 1:nthr
Expand Down
7 changes: 4 additions & 3 deletions src/symbolic_cliffords.jl
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ Base.@propagate_inbounds setzbit(s::TableauType{Tzv, Tme}, r::Int, c::Int, z::Tm
function _apply!(stab::AbstractStabilizer, gate::G; phases::Val{B}=Val(true)) where {B, G<:AbstractSingleQubitOperator}
s = tab(stab)
c = gate.q
@batch per=core minbatch=MINBATCH1Q for r in eachindex(s)
@inbounds @simd for r in eachindex(s)
x = getxbit(s, r, c)
z = getzbit(s, r, c)
x,z,phase = qubit_kernel(gate,x,z)
Expand Down Expand Up @@ -137,7 +137,7 @@ function _apply!(stab::AbstractStabilizer, op::SingleQubitOperator; phases::Val{
sh = getshift(Tme, c)
xx,zx,xz,zz = Tme.((op.xx,op.zx,op.xz,op.zz)) .<< sh
anticom = ~iszero((~zz & xz & ~xx & zx) | ( zz & ~xz & xx & zx) | (zz & xz & xx & ~zx))
@batch per=core minbatch=MINBATCH1Q for r in eachindex(s)
@inbounds @simd for r in eachindex(s)
x = getxbit(s, r, c)
z = getzbit(s, r, c)
setxbit(s, r, c, (x&xx)(z&zx))
Expand Down Expand Up @@ -220,7 +220,8 @@ function _apply!(stab::AbstractStabilizer, gate::G; phases::Val{B}=Val(true)) wh
q2 = gate.q2
Tme = eltype(s.xzs)
shift = getshift(Tme, q1) - getshift(Tme, q2)
@batch per=core minbatch=MINBATCH2Q for r in eachindex(s)
@inbounds @simd for r in eachindex(s)
# for r in eachindex(s)
x1 = getxbit(s, r, q1)
z1 = getzbit(s, r, q1)
x2 = getxbit(s, r, q2)<<shift
Expand Down
1 change: 0 additions & 1 deletion test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
Nemo = "2edaba10-b0f1-5616-af89-8c11ac63239a"
Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
Quantikz = "b0d11df0-eea3-4d79-b4a5-421488cbf74b"
QuantumInterface = "5717a53b-5d69-4fa3-b976-0bf2f97ca1e5"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand Down
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ println("Starting tests with $(Threads.nthreads()) threads out of `Sys.CPU_THREA
@doset "noisycircuits"
@doset "syndromemeas"
@doset "bitpack"
@doset "memorylayout"
@doset "graphs"
@doset "hash"
@doset "entanglement"
Expand Down
2 changes: 0 additions & 2 deletions test/test_jet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ using ArrayInterface
using Static
using Graphs
using LinearAlgebra
using Polyester

using JET: ReportPass, BasicPass, InferenceErrorReport, UncaughtExceptionReport

Expand All @@ -31,7 +30,6 @@ end
AnyFrameModule(ArrayInterface),
AnyFrameModule(Static),
AnyFrameModule(LinearAlgebra),
AnyFrameModule(Polyester)
)
)
@show rep
Expand Down
23 changes: 23 additions & 0 deletions test/test_memorylayout.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
using Random
using QuantumClifford
using Test

@testset "Column-fast vs row-fast operations" begin
for n in [3, 300]
for T in (Stabilizer, Destabilizer, MixedStabilizer, MixedDestabilizer)
s = T(random_stabilizer(n,n))
@test fastrow(copy(s)) == fastrow(fastcolumn(copy(s))) # TODO should not need to convert to the same layout for comparisons
@test canonicalize!(fastrow(copy(s))) == fastrow(canonicalize!(fastcolumn(copy(s)))) # TODO should not need to convert to the same layout for comparisons
c = random_clifford(n)
layouts = []
for layout in (fastrow, fastcolumn)
ss = layout(copy(s))
@test typeof(ss) == typeof(deepcopy(ss))
apply!(ss, c)
apply!(ss, sCNOT(1,n-1))
push!(layouts, ss)
end
@test fastrow(copy(layouts[1])) == fastrow(copy(layouts[2]))
end
end
end

2 comments on commit 14ee2c9

@Krastanov
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/86865

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.8.10 -m "<description of version>" 14ee2c9b07f5b443d2bd94415606ffde8e435bb2
git push origin v0.8.10

Please sign in to comment.